diff --git a/.ci/aarch64_linux/aarch64_ci_build.sh b/.ci/aarch64_linux/aarch64_ci_build.sh index bf8bab6dde232..a0eb0b72df2b3 100644 --- a/.ci/aarch64_linux/aarch64_ci_build.sh +++ b/.ci/aarch64_linux/aarch64_ci_build.sh @@ -5,9 +5,9 @@ GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-} # Set CUDA architecture lists to match x86 build_cuda.sh if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then - export TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;8.0;9.0" + export TORCH_CUDA_ARCH_LIST="8.0;9.0" elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then - export TORCH_CUDA_ARCH_LIST="7.0;8.0;9.0;10.0;12.0" + export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0" elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX" fi @@ -31,8 +31,7 @@ pip install -r /pytorch/requirements.txt pip install auditwheel==6.2.0 wheel if [ "$DESIRED_CUDA" = "cpu" ]; then echo "BASE_CUDA_VERSION is not set. Building cpu wheel." - #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files - USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn + python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn else echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA" export USE_SYSTEM_NCCL=1 @@ -42,13 +41,9 @@ else echo "Bundling CUDA libraries with wheel for aarch64." else echo "Using nvidia libs from pypi for aarch64." - # Fix platform constraints in PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64 - # Replace 'platform_machine == "x86_64"' with 'platform_machine == "aarch64"' - export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS//platform_machine == \'x86_64\'/platform_machine == \'aarch64\'}" echo "Updated PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64: $PYTORCH_EXTRA_INSTALL_REQUIREMENTS" export USE_NVIDIA_PYPI_LIBS=1 fi - #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files - USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda + python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda fi diff --git a/.ci/aarch64_linux/aarch64_wheel_ci_build.py b/.ci/aarch64_linux/aarch64_wheel_ci_build.py index 4bb9c64ea7772..d4afea81ac0b4 100755 --- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py +++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py @@ -138,6 +138,8 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None: folder = os.path.dirname(wheel_path) os.mkdir(f"{folder}/tmp") os.system(f"unzip {wheel_path} -d {folder}/tmp") + # Delete original wheel since it will be repackaged + os.system(f"rm {wheel_path}") # Check if we should use PyPI NVIDIA libraries or bundle system libraries use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1" @@ -211,7 +213,8 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None: ] # CUDA version-specific libraries - if "130" in desired_cuda: + if "13" in desired_cuda: + minor_version = desired_cuda[-1] version_specific_libs = [ "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13", "/usr/local/cuda/lib64/libcublas.so.13", @@ -221,7 +224,7 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None: "/usr/local/cuda/lib64/libcusolver.so.12", "/usr/local/cuda/lib64/libnvJitLink.so.13", "/usr/local/cuda/lib64/libnvrtc.so.13", - "/usr/local/cuda/lib64/libnvrtc-builtins.so.13.0", + f"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.{minor_version}", ] elif "12" in 
desired_cuda: # Get the last character for libnvrtc-builtins version (e.g., "129" -> "9") @@ -237,6 +240,8 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None: "/usr/local/cuda/lib64/libnvrtc.so.12", f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}", ] + else: + raise ValueError(f"Unsupported CUDA version: {desired_cuda}.") # Combine all libraries libs_to_copy = common_libs + version_specific_libs @@ -275,14 +280,7 @@ def complete_wheel(folder: str) -> str: f"/{folder}/dist/{repaired_wheel_name}", ) else: - repaired_wheel_name = wheel_name.replace( - "linux_aarch64", "manylinux_2_28_aarch64" - ) - print(f"Renaming {wheel_name} wheel to {repaired_wheel_name}") - os.rename( - f"/{folder}/dist/{wheel_name}", - f"/{folder}/dist/{repaired_wheel_name}", - ) + repaired_wheel_name = list_dir(f"/{folder}/dist")[0] print(f"Copying {repaired_wheel_name} to artifacts") shutil.copy2( @@ -319,7 +317,7 @@ def parse_arguments(): ).decode() print("Building PyTorch wheel") - build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " + build_vars = "" # MAX_JOB=5 is not required for CPU backend (see commit 465d98b) if enable_cuda: build_vars += "MAX_JOBS=5 " diff --git a/.ci/aarch64_linux/build_aarch64_wheel.py b/.ci/aarch64_linux/build_aarch64_wheel.py index 7a4715d330060..52525f14460da 100755 --- a/.ci/aarch64_linux/build_aarch64_wheel.py +++ b/.ci/aarch64_linux/build_aarch64_wheel.py @@ -241,7 +241,7 @@ def wait_for_connection(addr, port, timeout=15, attempt_cnt=5): try: with socket.create_connection((addr, port), timeout=timeout): return - except (ConnectionRefusedError, socket.timeout): # noqa: PERF203 + except (ConnectionRefusedError, TimeoutError): # noqa: PERF203 if i == attempt_cnt - 1: raise time.sleep(timeout) @@ -1004,7 +1004,7 @@ def parse_arguments(): install_condaforge_python(host, args.python_version) sys.exit(0) - python_version = args.python_version if args.python_version is not None else "3.9" + python_version = args.python_version if args.python_version is not None else "3.10" if args.use_torch_from_pypi: configure_system(host, compiler=args.compiler, python_version=python_version) diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 48be0cf538054..6ebff8d531e9f 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -214,8 +214,7 @@ case "$tag" in TRITON=yes ;; pytorch-linux-jammy-py3-gcc11-inductor-benchmarks) - # TODO (huydhn): Upgrade this to Python >= 3.10 - ANACONDA_PYTHON_VERSION=3.9 + ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=11 VISION=yes KATEX=yes @@ -263,13 +262,10 @@ case "$tag" in TRITON_CPU=yes ;; pytorch-linux-jammy-linter) - # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627. - # We will need to update mypy version eventually, but that's for another day. 
The task - # would be to upgrade mypy to 1.0.0 with Python 3.11 - PYTHON_VERSION=3.9 + PYTHON_VERSION=3.10 ;; - pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter) - PYTHON_VERSION=3.9 + pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter) + PYTHON_VERSION=3.10 CUDA_VERSION=12.8.1 ;; pytorch-linux-jammy-aarch64-py3.10-gcc11) diff --git a/.ci/docker/centos-rocm/Dockerfile b/.ci/docker/centos-rocm/Dockerfile index 07788af580e3a..4fa4ca29886e6 100644 --- a/.ci/docker/centos-rocm/Dockerfile +++ b/.ci/docker/centos-rocm/Dockerfile @@ -59,9 +59,13 @@ ENV INSTALLED_VISION ${VISION} # Install rocm ARG ROCM_VERSION +RUN mkdir ci_commit_pins +COPY ./common/common_utils.sh common_utils.sh +COPY ./ci_commit_pins/rocm-composable-kernel.txt ci_commit_pins/rocm-composable-kernel.txt COPY ./common/install_rocm.sh install_rocm.sh RUN bash ./install_rocm.sh -RUN rm install_rocm.sh +RUN rm install_rocm.sh common_utils.sh +RUN rm -r ci_commit_pins COPY ./common/install_rocm_magma.sh install_rocm_magma.sh RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} RUN rm install_rocm_magma.sh diff --git a/.ci/docker/ci_commit_pins/executorch.txt b/.ci/docker/ci_commit_pins/executorch.txt index 0e527f4682297..0a30a6037a05c 100644 --- a/.ci/docker/ci_commit_pins/executorch.txt +++ b/.ci/docker/ci_commit_pins/executorch.txt @@ -1 +1 @@ -56392aa978594cc155fa8af48cd949f5b5f1823a +e0dda9059d082537cee36be6c5e4fe3b18c880c0 diff --git a/.ci/docker/ci_commit_pins/huggingface-requirements.txt b/.ci/docker/ci_commit_pins/huggingface-requirements.txt index 66e5dbdfb1bb1..f4f3830136eb6 100644 --- a/.ci/docker/ci_commit_pins/huggingface-requirements.txt +++ b/.ci/docker/ci_commit_pins/huggingface-requirements.txt @@ -1,2 +1,2 @@ -transformers==4.54.0 +transformers==4.56.0 soxr==0.5.0 diff --git a/.ci/docker/ci_commit_pins/rocm-composable-kernel.txt b/.ci/docker/ci_commit_pins/rocm-composable-kernel.txt new file mode 100644 index 0000000000000..c45f46af95d03 --- /dev/null +++ b/.ci/docker/ci_commit_pins/rocm-composable-kernel.txt @@ -0,0 +1 @@ +7fe50dc3da2069d6645d9deb8c017a876472a977 diff --git a/.ci/docker/common/install_executorch.sh b/.ci/docker/common/install_executorch.sh index becd2264e3958..fb168acd4febe 100755 --- a/.ci/docker/common/install_executorch.sh +++ b/.ci/docker/common/install_executorch.sh @@ -42,22 +42,27 @@ install_pip_dependencies() { # A workaround, ExecuTorch has moved to numpy 2.0 which is not compatible with the current # numba and scipy version used in PyTorch CI conda_run pip uninstall -y numba scipy + # Yaspin is needed for running CI test (get_benchmark_analysis_data.py) + pip_install yaspin==3.1.0 popd } setup_executorch() { - pushd executorch - export PYTHON_EXECUTABLE=python - export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" + export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON -DEXECUTORCH_BUILD_TESTS=ON" as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true - popd } -clone_executorch -install_buck2 -install_conda_dependencies -install_pip_dependencies -setup_executorch +if [ $# -eq 0 ]; then + clone_executorch + install_buck2 + install_conda_dependencies + install_pip_dependencies + pushd executorch + setup_executorch + popd +else + "$@" +fi diff --git a/.ci/docker/common/install_rocm.sh b/.ci/docker/common/install_rocm.sh index 6d8918f79a0af..675a7a3437274 100644 --- a/.ci/docker/common/install_rocm.sh +++ b/.ci/docker/common/install_rocm.sh @@ -2,6 +2,11 @@ 
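# Editor's aside on the install_executorch.sh hunk above (hedged sketch, not part of the
# patch): the script now doubles as a tiny dispatcher. Called with no arguments it runs
# the full clone/install sequence as before; called with arguments it executes them as a
# command, which appears to be how .ci/pytorch/test.sh now runs a single step via
# "${INSTALL_SCRIPT}" setup_executorch without re-cloning ExecuTorch. The pattern in
# isolation, with made-up step names:
#
#   #!/bin/bash
#   step_one() { echo "step one"; }
#   step_two() { echo "step two"; }
#   run_all()  { step_one; step_two; }
#
#   if [ $# -eq 0 ]; then
#     run_all          # ./script.sh          -> run everything
#   else
#     "$@"             # ./script.sh step_two -> run just that function
#   fi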
set -ex +# for pip_install function +source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" + +ROCM_COMPOSABLE_KERNEL_VERSION="$(cat $(dirname $0)/../ci_commit_pins/rocm-composable-kernel.txt)" + ver() { printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' ' '); } @@ -109,8 +114,7 @@ EOF rm -rf HIP clr fi - # temporary hipblasLT dependency install - apt install libmsgpackc2 + pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION" # Cleanup apt-get autoclean && apt-get clean @@ -122,8 +126,8 @@ install_centos() { yum update -y yum install -y kmod yum install -y wget - - if [[ $OS_VERSION == 9 ]]; then + + if [[ $OS_VERSION == 9 ]]; then dnf install -y openblas-serial dnf install -y dkms kernel-headers kernel-devel else @@ -195,6 +199,8 @@ install_centos() { sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;" done + pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION" + # Cleanup yum clean all rm -rf /var/cache/yum diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index 45fef66fd567f..08687a02530e9 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -93,8 +93,9 @@ librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x" #Pinned versions: #test that import: -mypy==1.16.0 +mypy==1.16.0 ; platform_system != "Windows" # Pin MyPy version because new errors are likely to appear with each release +# Skip on Windows as lots of type annotations are POSIX specific #Description: linter #Pinned versions: 1.16.0 #test that import: test_typing.py, test_type_hints.py @@ -322,8 +323,6 @@ lxml==5.3.0 ; python_version <= "3.12" lxml==6.0.0 ; python_version == "3.13" #Description: This is a requirement of unittest-xml-reporting -# Python-3.9 binaries - PyGithub==2.3.0 sympy==1.13.3 diff --git a/.ci/docker/requirements-docs.txt b/.ci/docker/requirements-docs.txt index efe6fb4c949b0..c30ab3e993e94 100644 --- a/.ci/docker/requirements-docs.txt +++ b/.ci/docker/requirements-docs.txt @@ -1,7 +1,7 @@ sphinx==5.3.0 #Description: This is used to generate PyTorch docs #Pinned versions: 5.3.0 --e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2 +-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@d53b0ffb9b1cda68260693ea98f3483823c88d8e#egg=pytorch_sphinx_theme2 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering # but it doesn't seem to work and hangs around idly. 
The initial thought that it is probably diff --git a/.ci/docker/ubuntu-rocm/Dockerfile b/.ci/docker/ubuntu-rocm/Dockerfile index 681f6fe750510..b517a990a057b 100644 --- a/.ci/docker/ubuntu-rocm/Dockerfile +++ b/.ci/docker/ubuntu-rocm/Dockerfile @@ -52,9 +52,13 @@ ENV INSTALLED_VISION ${VISION} # Install rocm ARG ROCM_VERSION +RUN mkdir ci_commit_pins +COPY ./common/common_utils.sh common_utils.sh +COPY ./ci_commit_pins/rocm-composable-kernel.txt ci_commit_pins/rocm-composable-kernel.txt COPY ./common/install_rocm.sh install_rocm.sh RUN bash ./install_rocm.sh -RUN rm install_rocm.sh +RUN rm install_rocm.sh common_utils.sh +RUN rm -r ci_commit_pins COPY ./common/install_rocm_magma.sh install_rocm_magma.sh RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} RUN rm install_rocm_magma.sh diff --git a/.ci/libtorch/build.sh b/.ci/libtorch/build.sh index 54ddd905aad05..c2d67f8b1bb29 100644 --- a/.ci/libtorch/build.sh +++ b/.ci/libtorch/build.sh @@ -7,4 +7,4 @@ set -ex SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh +USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.10" ${SCRIPTPATH}/../manywheel/build.sh diff --git a/.ci/lumen_cli/cli/lib/core/vllm/lib.py b/.ci/lumen_cli/cli/lib/core/vllm/lib.py index 98cfc807e284a..0e2132839adbb 100644 --- a/.ci/lumen_cli/cli/lib/core/vllm/lib.py +++ b/.ci/lumen_cli/cli/lib/core/vllm/lib.py @@ -41,7 +41,6 @@ def sample_vllm_test_library(): "pytest -v -s basic_correctness/test_cumem.py", "pytest -v -s basic_correctness/test_basic_correctness.py", "pytest -v -s basic_correctness/test_cpu_offload.py", - "VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py", ], }, "vllm_basic_models_test": { @@ -68,15 +67,12 @@ def sample_vllm_test_library(): "-v", "-s", "entrypoints/llm", - "--ignore=entrypoints/llm/test_lazy_outlines.py", "--ignore=entrypoints/llm/test_generate.py", - "--ignore=entrypoints/llm/test_generate_multiple_loras.py", "--ignore=entrypoints/llm/test_collective_rpc.py", ] ), - "pytest -v -s entrypoints/llm/test_lazy_outlines.py", - "pytest -v -s entrypoints/llm/test_generate.py ", - "VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode", + "pytest -v -s entrypoints/llm/test_generate.py", + "pytest -v -s entrypoints/offline_mode", ], }, "vllm_regression_test": { diff --git a/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py b/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py index 8db48065cb052..415e05d07551b 100644 --- a/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py +++ b/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py @@ -66,6 +66,11 @@ class VllmBuildParameters: "DOCKERFILE_PATH", ".github/ci_configs/vllm/Dockerfile.tmp_vllm" ) + # the cleaning script to remove torch dependencies from pip + cleaning_script: Path = env_path_field( + "cleaning_script", ".github/ci_configs/vllm/use_existing_torch.py" + ) + # OUTPUT_DIR: where docker buildx (local exporter) will write artifacts output_dir: Path = env_path_field("OUTPUT_DIR", "external/vllm") @@ -160,6 +165,7 @@ def run(self): logger.info("Running vllm build with inputs: %s", inputs) vllm_commit = clone_vllm() + self.cp_torch_cleaning_script(inputs) self.cp_dockerfile_if_exist(inputs) # cp torch wheels from root direct to vllm workspace if exist self.cp_torch_whls_if_exist(inputs) @@ -205,6 +211,11 @@ def cp_torch_whls_if_exist(self, inputs: VllmBuildParameters) -> str: copy(inputs.torch_whls_path, tmp_dir) return tmp_dir + def 
cp_torch_cleaning_script(self, inputs: VllmBuildParameters): + script = get_path(inputs.cleaning_script, resolve=True) + vllm_script = Path(f"./{self.work_directory}/use_existing_torch.py") + copy(script, vllm_script) + def cp_dockerfile_if_exist(self, inputs: VllmBuildParameters): if not inputs.use_local_dockerfile: logger.info("using vllm default dockerfile.torch_nightly for build") diff --git a/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py b/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py index 76401e33f29fd..224f078788702 100644 --- a/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py +++ b/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py @@ -11,7 +11,7 @@ from cli.lib.common.cli_helper import BaseRunner from cli.lib.common.envs_helper import env_path_field, env_str_field, get_env -from cli.lib.common.path_helper import copy, remove_dir +from cli.lib.common.path_helper import copy, get_path, remove_dir from cli.lib.common.pip_helper import ( pip_install_first_match, pip_install_packages, @@ -43,6 +43,10 @@ class VllmTestParameters: torch_cuda_arch_list: str = env_str_field("TORCH_CUDA_ARCH_LIST", "8.9") + cleaning_script: Path = env_path_field( + "cleaning_script", ".github/ci_configs/vllm/use_existing_torch.py" + ) + def __post_init__(self): if not self.torch_whls_path.exists(): raise ValueError("missing torch_whls_path") @@ -92,11 +96,13 @@ def prepare(self): self._set_envs(params) clone_vllm(dst=self.work_directory) + self.cp_torch_cleaning_script(params) with working_directory(self.work_directory): remove_dir(Path("vllm")) self._install_wheels(params) self._install_dependencies() # verify the torches are not overridden by test dependencies + check_versions() def run(self): @@ -125,6 +131,11 @@ def run(self): # double check the torches are not overridden by other packages check_versions() + def cp_torch_cleaning_script(self, params: VllmTestParameters): + script = get_path(params.cleaning_script, resolve=True) + vllm_script = Path(f"./{self.work_directory}/use_existing_torch.py") + copy(script, vllm_script) + def _install_wheels(self, params: VllmTestParameters): logger.info("Running vllm test with inputs: %s", params) if not pkg_exists("torch"): diff --git a/.ci/pytorch/common_utils.sh b/.ci/pytorch/common_utils.sh index edfff60744919..9c9d223777466 100644 --- a/.ci/pytorch/common_utils.sh +++ b/.ci/pytorch/common_utils.sh @@ -258,11 +258,19 @@ function install_torchrec_and_fbgemm() { git clone --recursive https://github.com/pytorch/fbgemm pushd fbgemm/fbgemm_gpu git checkout "${fbgemm_commit}" --recurse-submodules - python setup.py bdist_wheel \ - --build-variant=rocm \ - -DHIP_ROOT_DIR="${ROCM_PATH}" \ - -DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \ - -DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA" + # until the fbgemm_commit includes the tbb patch + patch <<'EOF' +--- a/FbgemmGpu.cmake ++++ b/FbgemmGpu.cmake +@@ -184,5 +184,6 @@ gpu_cpp_library( + fbgemm_gpu_tbe_cache + fbgemm_gpu_tbe_optimizers + fbgemm_gpu_tbe_utils ++ tbb + DESTINATION + fbgemm_gpu) +EOF + python setup.py bdist_wheel --build-variant=rocm popd # Save the wheel before cleaning up diff --git a/.ci/pytorch/functorch_doc_push_script.sh b/.ci/pytorch/functorch_doc_push_script.sh deleted file mode 100755 index 85c70dffa3966..0000000000000 --- a/.ci/pytorch/functorch_doc_push_script.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash - -# This is where the local pytorch install in the docker image is located -pt_checkout="/var/lib/jenkins/workspace" -source "$pt_checkout/.ci/pytorch/common_utils.sh" -echo "functorch_doc_push_script.sh: Invoked with 
$*" - -set -ex -o pipefail - -version=${DOCS_VERSION:-nightly} -echo "version: $version" - -# Build functorch docs -pushd $pt_checkout/functorch/docs -make html -popd - -git clone https://github.com/pytorch/functorch -b gh-pages --depth 1 functorch_ghpages -pushd functorch_ghpages - -if [ "$version" == "main" ]; then - version=nightly -fi - -git rm -rf "$version" || true -mv "$pt_checkout/functorch/docs/build/html" "$version" - -git add "$version" || true -git status -git config user.email "soumith+bot@pytorch.org" -git config user.name "pytorchbot" -# If there aren't changes, don't make a commit; push is no-op -git commit -m "Generate Python docs from pytorch/pytorch@${GITHUB_SHA}" || true -git status - -if [[ "${WITH_PUSH:-}" == true ]]; then - git push -u origin gh-pages -fi - -popd diff --git a/.ci/pytorch/macos-test.sh b/.ci/pytorch/macos-test.sh index 79d47da431712..c1505bd58cdde 100755 --- a/.ci/pytorch/macos-test.sh +++ b/.ci/pytorch/macos-test.sh @@ -59,7 +59,7 @@ test_python_shard() { setup_test_python - time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --shard "$1" "$NUM_TEST_SHARDS" + time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "$1" "$NUM_TEST_SHARDS" assert_git_not_dirty } diff --git a/.ci/pytorch/numba-cuda-13.patch b/.ci/pytorch/numba-cuda-13.patch new file mode 100644 index 0000000000000..f96ff287ed396 --- /dev/null +++ b/.ci/pytorch/numba-cuda-13.patch @@ -0,0 +1,25 @@ +From 6e08c9d08e9de59c7af28b720289debbbd384764 Mon Sep 17 00:00:00 2001 +From: Michael Wang <13521008+isVoid@users.noreply.github.com> +Date: Tue, 1 Apr 2025 17:28:05 -0700 +Subject: [PATCH] Avoid bumping certain driver API to avoid future breakage + (#185) + +Co-authored-by: isVoid +--- + numba_cuda/numba/cuda/cudadrv/driver.py | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py +index 1641bf77..233e9ed7 100644 +--- a/numba_cuda/numba/cuda/cudadrv/driver.py ++++ b/numba_cuda/numba/cuda/cudadrv/driver.py +@@ -365,6 +365,9 @@ def _find_api(self, fname): + else: + variants = ('_v2', '') + ++ if fname in ("cuCtxGetDevice", "cuCtxSynchronize"): ++ return getattr(self.lib, fname) ++ + for variant in variants: + try: + return getattr(self.lib, f'{fname}{variant}') diff --git a/.ci/pytorch/smoke_test/smoke_test.py b/.ci/pytorch/smoke_test/smoke_test.py index 305ad15d98e7e..675d58a3e283d 100644 --- a/.ci/pytorch/smoke_test/smoke_test.py +++ b/.ci/pytorch/smoke_test/smoke_test.py @@ -386,8 +386,8 @@ def foo(x: torch.Tensor) -> torch.Tensor: def smoke_test_nvshmem() -> None: - if not torch.cuda.is_available(): - print("CUDA is not available, skipping NVSHMEM test") + if not torch.cuda.is_available() or target_os == "windows": + print("Windows platform or CUDA is not available, skipping NVSHMEM test") return # Check if NVSHMEM is compiled in current build @@ -396,7 +396,9 @@ def smoke_test_nvshmem() -> None: except ImportError: # Not built with NVSHMEM support. 
# torch is not compiled with NVSHMEM prior to 2.9 - if torch.__version__ < "2.9": + from torch.torch_version import TorchVersion + + if TorchVersion(torch.__version__) < (2, 9): return else: # After 2.9: NVSHMEM is expected to be compiled in current build diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index e0d47259676b7..7267541483438 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh @@ -32,6 +32,16 @@ if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /v git config --global --add safe.directory /var/lib/jenkins/workspace fi + +# Patch numba to avoid CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878 +NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true) +if [ -n "$NUMBA_CUDA_DIR" ]; then + NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch" + pushd "$NUMBA_CUDA_DIR" + patch -p4 <"$NUMBA_PATCH" + popd +fi + echo "Environment variables:" env @@ -312,23 +322,29 @@ test_python_shard() { # modify LD_LIBRARY_PATH to ensure it has the conda env. # This set of tests has been shown to be buggy without it for the split-build - time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running + time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running assert_git_not_dirty } test_python() { # shellcheck disable=SC2086 - time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION + time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION assert_git_not_dirty } test_python_smoke() { - # Smoke tests for H100 + # Smoke tests for H100/B200 time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running assert_git_not_dirty } +test_python_smoke_b200() { + # Targeted smoke tests for B200 - staged approach to avoid too many failures + time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running + assert_git_not_dirty +} + test_h100_distributed() { # Distributed tests at H100 time python test/run_test.py --include distributed/_composable/test_composability/test_pp_composability.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running @@ -374,6 +390,7 @@ test_dynamo_wrapped_shard() { --exclude-distributed-tests \ --exclude-torch-export-tests \ --exclude-aot-dispatch-tests \ + --exclude-quantization-tests \ --shard "$1" "$NUM_TEST_SHARDS" \ --verbose \ --upload-artifacts-while-running @@ -1146,6 +1163,12 @@ test_distributed() { fi } +test_quantization() { + echo "Testing quantization" + + python test/test_quantization.py +} + test_rpc() { echo "Testing RPC C++ tests" # NB: the ending test_rpc must match the current function name for the current @@ -1540,14 +1563,10 @@ test_executorch() { install_torchvision install_torchaudio - pushd /executorch - - export PYTHON_EXECUTABLE=python - export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" + 
INSTALL_SCRIPT="$(pwd)/.ci/docker/common/install_executorch.sh" - # NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch - # from the PR - bash .ci/scripts/setup-linux.sh --build-tool cmake + pushd /executorch + "${INSTALL_SCRIPT}" setup_executorch echo "Run ExecuTorch unit tests" pytest -v -n auto @@ -1561,17 +1580,14 @@ test_executorch() { popd - # Test torchgen generated code for Executorch. - echo "Testing ExecuTorch op registration" - "$BUILD_BIN_DIR"/test_edge_op_registration - assert_git_not_dirty } test_linux_aarch64() { python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \ test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \ - test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \ + test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops profiler/test_memory_profiler \ + distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \ --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose # Dynamo tests @@ -1646,6 +1662,8 @@ elif [[ "${TEST_CONFIG}" == *executorch* ]]; then test_executorch elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then test_python_legacy_jit +elif [[ "$TEST_CONFIG" == 'quantization' ]]; then + test_quantization elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then # TODO: run some C++ tests echo "no-op at the moment" @@ -1721,11 +1739,6 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then elif [[ "${TEST_CONFIG}" == *inductor* ]]; then install_torchvision test_inductor_shard "${SHARD_NUMBER}" - if [[ "${SHARD_NUMBER}" == 1 ]]; then - if [[ "${BUILD_ENVIRONMENT}" != linux-jammy-py3.9-gcc11-build ]]; then - test_inductor_distributed - fi - fi elif [[ "${TEST_CONFIG}" == *einops* ]]; then test_einops elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then @@ -1775,6 +1788,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then test_xpu_bin elif [[ "${TEST_CONFIG}" == smoke ]]; then test_python_smoke +elif [[ "${TEST_CONFIG}" == smoke_b200 ]]; then + test_python_smoke_b200 elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then test_h100_distributed elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then diff --git a/.ci/pytorch/win-test-helpers/build_pytorch.bat b/.ci/pytorch/win-test-helpers/build_pytorch.bat index 19d715b9d0b6d..67d1569221924 100644 --- a/.ci/pytorch/win-test-helpers/build_pytorch.bat +++ b/.ci/pytorch/win-test-helpers/build_pytorch.bat @@ -137,7 +137,7 @@ sccache --show-stats python -c "import os, glob; os.system('python -mpip install --no-index --no-deps ' + glob.glob('dist/*.whl')[0])" ( if "%BUILD_ENVIRONMENT%"=="" ( - echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3` in Command Prompt before running Git Bash. + echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%\envs\py_tmp` in Command Prompt before running Git Bash. 
) else ( copy /Y "dist\*.whl" "%PYTORCH_FINAL_PACKAGE_DIR%" diff --git a/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat b/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat index 01e08c8bb4e5c..abd2c8722b11d 100644 --- a/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat +++ b/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat @@ -3,12 +3,12 @@ if "%BUILD_ENVIRONMENT%"=="" ( ) else ( set CONDA_PARENT_DIR=C:\Jenkins ) - +set CONDA_ROOT_DIR=%CONDA_PARENT_DIR%\Miniconda3 :: Be conservative here when rolling out the new AMI with conda. This will try :: to install conda as before if it couldn't find the conda installation. This :: can be removed eventually after we gain enough confidence in the AMI -if not exist %CONDA_PARENT_DIR%\Miniconda3 ( +if not exist %CONDA_ROOT_DIR% ( set INSTALL_FRESH_CONDA=1 ) @@ -17,10 +17,14 @@ if "%INSTALL_FRESH_CONDA%"=="1" ( if errorlevel 1 exit /b if not errorlevel 0 exit /b - %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3 + %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_ROOT_DIR% if errorlevel 1 exit /b if not errorlevel 0 exit /b ) :: Activate conda so that we can use its commands, i.e. conda, python, pip -call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3 +call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR% +:: Activate conda so that we can use its commands, i.e. conda, python, pip +call conda activate py_tmp + +call pip install -r .ci/docker/requirements-ci.txt diff --git a/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat index 4a464d6b5786a..3173582b06f45 100644 --- a/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat +++ b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat @@ -14,7 +14,7 @@ if not errorlevel 0 exit /b :: build\torch. Rather than changing all these references, making a copy of torch folder :: from conda to the current workspace is easier. The workspace will be cleaned up after :: the job anyway -xcopy /s %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\ +xcopy /s %CONDA_ROOT_DIR%\envs\py_tmp\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\ pushd . 
if "%VC_VERSION%" == "" ( diff --git a/.ci/pytorch/win-test-helpers/test_python_shard.bat b/.ci/pytorch/win-test-helpers/test_python_shard.bat index d0fa3babe59d5..02829ee369757 100644 --- a/.ci/pytorch/win-test-helpers/test_python_shard.bat +++ b/.ci/pytorch/win-test-helpers/test_python_shard.bat @@ -25,7 +25,7 @@ echo Copying over test times file robocopy /E "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.additional_ci_files" "%PROJECT_DIR_WIN%\.additional_ci_files" echo Run nn tests -python run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose +python run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose if ERRORLEVEL 1 goto fail popd diff --git a/.ci/pytorch/win-test.sh b/.ci/pytorch/win-test.sh index 43524dc04e3fb..c96d5c331c9f8 100755 --- a/.ci/pytorch/win-test.sh +++ b/.ci/pytorch/win-test.sh @@ -38,7 +38,14 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then fi # TODO: Move both of them to Windows AMI -python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1 +python -m pip install tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1 + +# Copied from https://github.com/pytorch/test-infra/blob/be01a40157c36cd5a48391fdf44a7bc3ebd4c7e3/aws/ami/windows/scripts/Installers/Install-Pip-Dependencies.ps1#L16 with some adjustments +# pytest-rerunfailures==10.3 as 10.2 fails with INTERNALERROR> pluggy._manager.PluginValidationError: unknown hook 'pytest_configure_node' +# scipy from 1.6.3 to 1.10 +# expecttest from 0.1.3 to 0.3.0 +# xdoctest from 1.0.2 to 1.3.0 +python -m pip install "future==0.18.2" "hypothesis==5.35.1" "expecttest==0.3.0" "librosa>=0.6.2" "scipy==1.10.1" "psutil==5.9.1" "pynvml==11.4.1" "pillow==9.2.0" "unittest-xml-reporting<=3.2.0,>=2.0.0" "pytest==7.1.3" "pytest-xdist==2.5.0" "pytest-flakefinder==1.1.0" "pytest-rerunfailures==10.3" "pytest-shard==0.1.2" "sympy==1.11.1" "xdoctest==1.3.0" "pygments==2.12.0" "opt-einsum>=3.3" "networkx==2.8.8" "mpmath==1.2.1" "pytest-cpp==2.3.0" "boto3==1.35.42" # Install Z3 optional dependency for Windows builds. 
python -m pip install z3-solver==4.15.1.0 @@ -52,9 +59,6 @@ python -m pip install parameterized==0.8.1 # Install pulp for testing ilps under torch\distributed\_tools python -m pip install pulp==2.9.0 -# Install expecttest to merge https://github.com/pytorch/pytorch/pull/155308 -python -m pip install expecttest==0.3.0 - run_tests() { # Run nvidia-smi if available for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do diff --git a/.ci/wheel/build_wheel.sh b/.ci/wheel/build_wheel.sh index 763fce4b73e18..98b50c0ceeafe 100755 --- a/.ci/wheel/build_wheel.sh +++ b/.ci/wheel/build_wheel.sh @@ -85,7 +85,7 @@ mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true # Create an isolated directory to store this builds pytorch checkout and conda # installation if [[ -z "$MAC_PACKAGE_WORK_DIR" ]]; then - MAC_PACKAGE_WORK_DIR="$(pwd)/tmp_wheel_conda_${DESIRED_PYTHON}_$(date +%H%M%S)" + MAC_PACKAGE_WORK_DIR="$(pwd)/tmp_wheel_${DESIRED_PYTHON}_$(date +%H%M%S)" fi mkdir -p "$MAC_PACKAGE_WORK_DIR" || true if [[ -n ${GITHUB_ACTIONS} ]]; then @@ -96,11 +96,11 @@ fi whl_tmp_dir="${MAC_PACKAGE_WORK_DIR}/dist" mkdir -p "$whl_tmp_dir" -mac_version='macosx_11_0_arm64' +mac_version='macosx-11_0-arm64' libtorch_arch='arm64' # Create a consistent wheel package name to rename the wheel to -wheel_filename_new="${TORCH_PACKAGE_NAME}-${build_version}${build_number_prefix}-cp${python_nodot}-none-${mac_version}.whl" +wheel_filename_new="${TORCH_PACKAGE_NAME}-${build_version}${build_number_prefix}-cp${python_nodot}-none-${mac_version//[-,]/_}.whl" ########################################################### @@ -125,7 +125,6 @@ popd export TH_BINARY_BUILD=1 export INSTALL_TEST=0 # dont install test binaries into site-packages export MACOSX_DEPLOYMENT_TARGET=11.0 -export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} EXTRA_CONDA_INSTALL_FLAGS="" CONDA_ENV_CREATE_FLAGS="" @@ -133,25 +132,19 @@ RENAME_WHEEL=true case $desired_python in 3.14t) echo "Using 3.14 deps" + mac_version='macosx-11.0-arm64' NUMPY_PINNED_VERSION="==2.1.0" - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" RENAME_WHEEL=false ;; 3.14) echo "Using 3.14t deps" + mac_version='macosx-11.0-arm64' NUMPY_PINNED_VERSION="==2.1.0" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" RENAME_WHEEL=false ;; 3.13t) echo "Using 3.13 deps" NUMPY_PINNED_VERSION="==2.1.0" - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" - desired_python="3.13" RENAME_WHEEL=false ;; 3.13) @@ -176,17 +169,12 @@ case $desired_python in ;; esac -# Install into a fresh env -tmp_env_name="wheel_py$python_nodot" -conda create ${EXTRA_CONDA_INSTALL_FLAGS} -yn "$tmp_env_name" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} -source activate "$tmp_env_name" - PINNED_PACKAGES=( "numpy${NUMPY_PINNED_VERSION}" ) -retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements-build.txt" -pip install requests ninja typing-extensions -retry pip install -r "${pytorch_rootdir}/requirements.txt" || true +python -mvenv ~/${desired_python}-build +source ~/${desired_python}-build/bin/activate +retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt" retry brew install libomp # For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which @@ -200,7 +188,7 @@ export BUILD_TEST=OFF pushd 
"$pytorch_rootdir" echo "Calling setup.py bdist_wheel at $(date)" -python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name ${mac_version} +_PYTHON_HOST_PLATFORM=${mac_version} ARCHFLAGS="-arch arm64" python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name "${mac_version//[-.]/_}" echo "Finished setup.py bdist_wheel at $(date)" diff --git a/.flake8 b/.flake8 index fc9ab167fbeef..fa73b7b880fd3 100644 --- a/.flake8 +++ b/.flake8 @@ -73,7 +73,7 @@ exclude = ./docs/src, ./functorch/docs, ./functorch/examples, - ./functorch/notebooks, + ./functorch/docs/source/tutorials, ./scripts, ./test/generated_type_hints_smoketest.py, ./third_party, diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index 798dee312306d..d4a7df9d5805b 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -21,6 +21,7 @@ self-hosted-runner: - linux.arm64.2xlarge.ephemeral - linux.arm64.m7g.4xlarge - linux.arm64.m7g.4xlarge.ephemeral + - linux.arm64.r7g.12xlarge.memory - linux.4xlarge.nvidia.gpu - linux.8xlarge.nvidia.gpu - linux.16xlarge.nvidia.gpu diff --git a/.github/actions/reuse-old-whl/reuse_old_whl.py b/.github/actions/reuse-old-whl/reuse_old_whl.py index def0276a9c8a3..48a8490985946 100644 --- a/.github/actions/reuse-old-whl/reuse_old_whl.py +++ b/.github/actions/reuse-old-whl/reuse_old_whl.py @@ -264,7 +264,7 @@ def change_content_to_new_version(file: Union[str, Path]) -> None: change_content_to_new_version(f"artifacts/dist/{old_stem}/torch/version.py") for file in Path(f"artifacts/dist/{old_stem}").glob( - "*.dist-info/**", + "*.dist-info/*", ): change_content_to_new_version(file) diff --git a/.github/actions/setup-win/action.yml b/.github/actions/setup-win/action.yml index 93c957896b5e8..37cec0c571538 100644 --- a/.github/actions/setup-win/action.yml +++ b/.github/actions/setup-win/action.yml @@ -6,6 +6,12 @@ inputs: cuda-version: description: which cuda version to install, 'cpu' for none required: true + python-version: + required: false + type: string + default: "3.10" + description: | + The python version to be used. Will be 3.10 by default runs: using: composite @@ -38,18 +44,24 @@ runs: CONDA="C:\Jenkins\Miniconda3\condabin\conda.bat" { + echo "CONDA=${CONDA}"; echo "CONDA_RUN=${CONDA} run --no-capture-output"; echo "CONDA_BUILD=${CONDA} run conda-build"; echo "CONDA_INSTALL=${CONDA} install"; } >> "${GITHUB_ENV}" - name: Setup Python3 + env: + PYTHON_VERSION: ${{ inputs.python-version }} shell: bash run: | set +e set -x - PYTHON3=$(${CONDA_RUN} which python3) + # Create new py_tmp env with python-version + ${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp + + PYTHON3=$(${CONDA_RUN} -n py_tmp which python3) EXIT_CODE=$? if [[ "${EXIT_CODE}" == "0" ]]; then @@ -62,7 +74,7 @@ runs: # installation, which is Python 3 based. Its Python is default to Python 3. Further, there # is also the Miniconda installation that is Python 2 based, and both can be installed if # needed. In both cases, Python binary is just called python - PYTHON=$(${CONDA_RUN} which python) + PYTHON=$(${CONDA_RUN} -n py_tmp which python) EXIT_CODE=$? 
if [[ "${EXIT_CODE}" == "0" ]]; then diff --git a/.github/ci_commit_pins/audio.txt b/.github/ci_commit_pins/audio.txt index b0255e764c594..05e0b684b4278 100644 --- a/.github/ci_commit_pins/audio.txt +++ b/.github/ci_commit_pins/audio.txt @@ -1 +1 @@ -27fc2493d383354a008106f22f3be232badee9a1 +87ff22e49ed0e92576c4935ccb8c143daac4a3cd diff --git a/.github/ci_commit_pins/vllm.txt b/.github/ci_commit_pins/vllm.txt index c9c4265b2f37f..512b7c7da00e2 100644 --- a/.github/ci_commit_pins/vllm.txt +++ b/.github/ci_commit_pins/vllm.txt @@ -1 +1 @@ -e10fef08838612b4560e9c72e5cb1414a5edfa13 +1983609239caaab24ab1ed2bfa2aa92e8c76c1b1 diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt index eb335eb9d64d5..504d924ec7641 100644 --- a/.github/ci_commit_pins/xla.txt +++ b/.github/ci_commit_pins/xla.txt @@ -1 +1 @@ -6c5478ff7c3d50dd1e3047d72ec5909bea474073 +c77852e117bdf056c8e9a087e51d6f65cf6ba53d diff --git a/.github/ci_configs/vllm/Dockerfile.tmp_vllm b/.github/ci_configs/vllm/Dockerfile.tmp_vllm index 2cee6ed2df19a..a1b68ad28210d 100644 --- a/.github/ci_configs/vllm/Dockerfile.tmp_vllm +++ b/.github/ci_configs/vllm/Dockerfile.tmp_vllm @@ -82,16 +82,10 @@ RUN if command -v apt-get >/dev/null; then \ apt-get update -y \ && apt-get install -y ccache software-properties-common git curl wget sudo vim; \ else \ - dnf install -y git curl wget sudo vim; \ + dnf install -y git curl wget sudo; \ fi \ && python3 --version && python3 -m pip --version -# Workaround for https://github.com/openai/triton/issues/2507 and -# https://github.com/pytorch/pytorch/issues/107960 -- hopefully -# this won't be needed for future versions of this docker image -# or future versions of triton. -RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ - # Install uv for faster pip installs if not existed RUN --mount=type=cache,target=/root/.cache/uv \ if ! python3 -m uv --version >/dev/null 2>&1; then \ @@ -220,11 +214,16 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0 RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,source=.git,target=.git \ if [ "$USE_SCCACHE" = "1" ]; then \ - echo "Installing sccache..." 
\ - && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \ + echo "Installing sccache..."; \ + if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + SCCACHE_ARCHIVE="sccache-v0.8.1-aarch64-unknown-linux-musl"; \ + else \ + SCCACHE_ARCHIVE="sccache-v0.8.1-x86_64-unknown-linux-musl"; \ + fi; \ + curl -L -o sccache.tar.gz "https://github.com/mozilla/sccache/releases/download/v0.8.1/${SCCACHE_ARCHIVE}.tar.gz" \ && tar -xzf sccache.tar.gz \ - && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \ - && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \ + && sudo mv "${SCCACHE_ARCHIVE}"/sccache /usr/bin/sccache \ + && rm -rf sccache.tar.gz "${SCCACHE_ARCHIVE}" \ && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \ && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \ && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \ @@ -285,7 +284,7 @@ RUN if command -v apt-get >/dev/null; then \ && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION}; \ else \ - dnf install -y git curl wget sudo vim; \ + dnf install -y git curl wget sudo; \ fi \ && python3 --version && python3 -m pip --version @@ -298,12 +297,6 @@ RUN echo "[INFO] Listing current directory before torch install step:" && \ echo "[INFO] Showing torch_build_versions.txt content:" && \ cat torch_build_versions.txt -# Workaround for https://github.com/openai/triton/issues/2507 and -# https://github.com/pytorch/pytorch/issues/107960 -- hopefully -# this won't be needed for future versions of this docker image -# or future versions of triton. -RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ - # Install uv for faster pip installs if not existed RUN --mount=type=cache,target=/root/.cache/uv \ if ! 
python3 -m uv --version > /dev/null 2>&1; then \ diff --git a/.github/ci_configs/vllm/use_existing_torch.py b/.github/ci_configs/vllm/use_existing_torch.py new file mode 100644 index 0000000000000..f55db97850d9c --- /dev/null +++ b/.github/ci_configs/vllm/use_existing_torch.py @@ -0,0 +1,17 @@ +import glob + + +requires_files = glob.glob("requirements/*.txt") +requires_files += ["pyproject.toml"] +for file in requires_files: + print(f">>> cleaning {file}") + with open(file) as f: + lines = f.readlines() + if "torch" in "".join(lines).lower(): + print("removed:") + with open(file, "w") as f: + for line in lines: + if "torch" not in line.lower(): + f.write(line) + print(f"<<< done cleaning {file}") + print() diff --git a/.github/labeler.yml b/.github/labeler.yml index 8b1acc77c267f..eb4076d81331d 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -130,3 +130,6 @@ - torch/csrc/inductor/aoti_include/** - torchgen/aoti/** - torchgen/gen_aoti_c_shim.py + +"ciflow/vllm": +- .github/ci_commit_pins/vllm.txt diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index a0aa6921b92ba..9f0937eb9f04b 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -19,6 +19,7 @@ ciflow_push_tags: - ciflow/nightly - ciflow/periodic - ciflow/periodic-rocm-mi300 +- ciflow/quantization-periodic - ciflow/rocm - ciflow/rocm-mi300 - ciflow/s390 @@ -36,6 +37,7 @@ ciflow_push_tags: - ciflow/win-arm64 - ciflow/h100-symm-mem - ciflow/h100-cutlass-backend +- ciflow/b200 retryable_workflows: - pull - trunk diff --git a/.github/requirements/pip-requirements-macOS.txt b/.github/requirements/pip-requirements-macOS.txt index 3a27cac46f71f..5fc26302a0add 100644 --- a/.github/requirements/pip-requirements-macOS.txt +++ b/.github/requirements/pip-requirements-macOS.txt @@ -15,7 +15,7 @@ optree==0.13.0 packaging==23.1 parameterized==0.8.1 pillow==10.3.0 -protobuf==5.29.4 +protobuf==5.29.5 psutil==5.9.8 pygments==2.15.0 pytest-cpp==2.3.0 @@ -26,7 +26,7 @@ pytest-xdist==3.3.1 pytest==7.3.2 pyyaml==6.0.2 scipy==1.12.0 -setuptools==72.1.0 +setuptools==78.1.1 sympy==1.13.3 tlparse==0.4.0 tensorboard==2.13.0 diff --git a/.github/scripts/docathon-label-sync.py b/.github/scripts/docathon-label-sync.py index ccd2eb0f4bd0f..04f4707a55c3f 100644 --- a/.github/scripts/docathon-label-sync.py +++ b/.github/scripts/docathon-label-sync.py @@ -39,7 +39,9 @@ def main() -> None: pull_request_label_names = [label.name for label in pull_request_labels] issue_label_names = [label.name for label in issue_labels] labels_to_add = [ - label for label in issue_label_names if label not in pull_request_label_names + label + for label in issue_label_names + if label not in pull_request_label_names and label != "actionable" ] if not labels_to_add: print("The pull request already has the same labels.") diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py index 4a4f8a65f684d..e57c2d5ef0749 100644 --- a/.github/scripts/generate_binary_build_matrix.py +++ b/.github/scripts/generate_binary_build_matrix.py @@ -43,55 +43,55 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = { "12.6": ( - "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | " - 
"nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'" + "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | " + "nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | " + "nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | " + "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | " + "nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | " + "nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | " + "nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | " + "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | " + "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | " + "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | " + "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | " + "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | " + "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | " + "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | " + "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'" ), "12.8": ( - "nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'" + "nvidia-cuda-nvrtc-cu12==12.8.93; 
platform_system == 'Linux' | " + "nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | " + "nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | " + "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | " + "nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | " + "nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | " + "nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | " + "nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | " + "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | " + "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | " + "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | " + "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | " + "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | " + "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | " + "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'" ), "13.0": ( - "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'" + "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | " + "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | " + "nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | " + "nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | " + "nvidia-cublas==13.0.0.19; platform_system == 'Linux' | " + "nvidia-cufft==12.0.0.15; platform_system == 'Linux' | " + "nvidia-curand==10.4.0.35; platform_system == 'Linux' | " + "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | " + "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | " + "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | " + "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | " + "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | " + "nvidia-nvtx==13.0.39; platform_system == 'Linux' | " + "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | " + "nvidia-cufile==1.15.0.42; platform_system == 'Linux'" ), "xpu": ( "intel-cmplr-lib-rt==2025.2.1 | " diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index 67906d4ad88d5..0396c405ad0a7 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py 
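# Editor's aside on the PYTORCH_EXTRA_INSTALL_REQUIREMENTS changes above (illustrative
# sketch using the `packaging` library; not part of this module): dropping the
# platform_machine == 'x86_64' clause makes the nvidia-* pins apply to every Linux
# machine, including aarch64, which is why the marker rewrite could be removed from
# aarch64_ci_build.sh earlier in this diff. How pip evaluates the two markers on an
# aarch64 Linux box:
#
#   from packaging.markers import Marker
#
#   arm_linux = {"platform_system": "Linux", "platform_machine": "aarch64"}
#   old = Marker("platform_system == 'Linux' and platform_machine == 'x86_64'")
#   new = Marker("platform_system == 'Linux'")
#   assert old.evaluate(environment=arm_linux) is False
#   assert new.evaluate(environment=arm_linux) is True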
@@ -135,7 +135,7 @@ class OperatingSystem: build_configs=generate_binary_build_matrix.generate_wheels_matrix( OperatingSystem.LINUX, arches=["6.4"], - python_versions=["3.9"], + python_versions=["3.10"], ), ciflow_config=CIFlowConfig( labels={ diff --git a/.github/scripts/prepare_vllm_wheels.sh b/.github/scripts/prepare_vllm_wheels.sh new file mode 100755 index 0000000000000..62362c7ff207c --- /dev/null +++ b/.github/scripts/prepare_vllm_wheels.sh @@ -0,0 +1,94 @@ +#!/usr/bin/env bash + +set -eux + +torch_version=$(unzip -p torch-* '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) +nightly=$(echo ${torch_version} | cut -d'.' -f4) + +# Copied from .ci/manywheel/build_common.sh +make_wheel_record() { + fpath=$1 + if echo $fpath | grep RECORD >/dev/null 2>&1; then + echo "$fpath,," + else + fhash=$(openssl dgst -sha256 -binary $fpath | openssl base64 | sed -e 's/+/-/g' | sed -e 's/\//_/g' | sed -e 's/=//g') + fsize=$(ls -nl $fpath | awk '{print $5}') + echo "$fpath,sha256=$fhash,$fsize" + fi +} + +change_wheel_version() { + local package=$1 + local wheel=$2 + local f_version=$3 + local t_version=$4 + + # Extract the wheel + ${PYTHON_EXECUTABLE} -mwheel unpack $wheel + + mv "${package}-${f_version}" "${package}-${t_version}" + # Change the version from f_version to t_version in the dist-info dir + pushd "${package}-${t_version}" + mv "${package}-${f_version}.dist-info" "${package}-${t_version}.dist-info" + + pushd "${package}-${t_version}.dist-info" + sed -i "s/${package}-${f_version}.dist-info/${package}-${t_version}.dist-info/g" RECORD + + # Update the version in METADATA and its SHA256 hash + sed -i "s/Version: ${f_version}/Version: ${t_version}/g" METADATA + # then add PyTorch nightly dependency of vLLM + if [[ "${package}" == vllm ]] || [[ "${package}" == xformers ]]; then + sed -i "/License-File/a\Requires-Dist: torch==${torch_version}" METADATA + fi + sed -i '/METADATA,sha256/d' RECORD + popd + + make_wheel_record "${package}-${t_version}.dist-info/METADATA" >> "${package}-${t_version}.dist-info/RECORD" + popd + + # Repack the wheel + ${PYTHON_EXECUTABLE} -mwheel pack "${package}-${t_version}" + + # Clean up + rm -rf "${package}-${t_version}" +} + +repackage_wheel() { + local package=$1 + pushd $package + + local orig_wheel=$(find . -name *${package//-/_}*) + local orig_version=$(unzip -p $orig_wheel '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) + + local version="" + if [[ "${package}" == vllm ]]; then + # Copied from vllm/.buildkite/scripts/upload-wheels.sh + version=1.0.0 + else + version=$(echo $orig_version | tr '.+' '.' | cut -d'.' -f1-3) + fi + local nightly_version=$version.$nightly + + # Use nightly version + change_wheel_version ${package//-/_} $orig_wheel $orig_version $nightly_version + # Clean up + rm "${orig_wheel}" + + auditwheel repair --plat $PLATFORM *.whl \ + --exclude libc10* --exclude libtorch* --exclude libcu* --exclude libnv* + local repair_wheel=$(find wheelhouse -name *${PLATFORM}*) + local repair_wheel=$(basename ${repair_wheel}) + popd + + cp ${package}/wheelhouse/${repair_wheel} . 
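# Editor's aside on prepare_vllm_wheels.sh (hedged walk-through with made-up version
# strings; the real values depend on the nightly being built): everything is derived
# from the torch wheel's METADATA.
#
#   torch_version=2.10.0.dev20250101    # "Version:" field of torch-*.whl METADATA
#   nightly=dev20250101                 # 4th dot-separated field of torch_version
#   # xformers 0.0.33+abc123 -> base 0.0.33 -> repackaged as 0.0.33.dev20250101
#   # vllm is pinned to 1.0.0 upstream   -> repackaged as 1.0.0.dev20250101
#
# change_wheel_version() renames the .dist-info directory, rewrites the Version in
# METADATA (adding a Requires-Dist: torch==<torch_version> pin for vllm and xformers),
# refreshes the METADATA entry in RECORD, and repacks the wheel before auditwheel
# repair runs.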
+ rm -rf $package +} + +# Require to re-package the wheel +${PYTHON_EXECUTABLE} -mpip install wheel==0.45.1 + +pushd externals/vllm/wheels +for package in xformers flashinfer-python vllm; do + repackage_wheel $package +done +popd diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2 index fee9ca2eac120..a0f8befddf39e 100644 --- a/.github/templates/linux_binary_build_workflow.yml.j2 +++ b/.github/templates/linux_binary_build_workflow.yml.j2 @@ -71,7 +71,7 @@ jobs: with:!{{ upload.binary_env_as_input(config) }} {%- if "aarch64" in build_environment %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" {%- elif "s390x" in build_environment %} runs_on: linux.s390x diff --git a/.github/templates/macos_binary_build_workflow.yml.j2 b/.github/templates/macos_binary_build_workflow.yml.j2 index f4b2a66d2acda..7f307447c3576 100644 --- a/.github/templates/macos_binary_build_workflow.yml.j2 +++ b/.github/templates/macos_binary_build_workflow.yml.j2 @@ -22,6 +22,16 @@ name: !{{ build_environment }} echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" {%- endmacro %} +{%- macro setup_python(py_ver) -%} + - name: Setup Python + uses: actions/setup-python@v6 + with: + # TODO: Removeme once 3.14 is out + # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 + python-version: "!{{ (py_ver.strip('t') + '.4') if '3.14' not in py_ver else '3.14.0-rc.2' }}" + freethreaded: !{{ "true" if py_ver.endswith('t') else "false" }} +{%- endmacro %} + on: # TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 push: @@ -61,23 +71,13 @@ jobs: {%- endif %} steps: !{{ set_runner_specific_vars() }} - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + !{{ setup_python(config.get("python_version", "3.10")) }} !{{ common.checkout(deep_clone=False, directory="pytorch") }} - name: Populate binary env run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -94,8 +94,6 @@ jobs: {%- if config["package_type"] == "wheel" %} - name: Test PyTorch wheel run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -106,33 +104,9 @@ jobs: SMOKE_TEST_PARAMS="" - EXTRA_CONDA_INSTALL_FLAGS="" - CONDA_ENV_CREATE_FLAGS="" - # shellcheck disable=SC2153 - case $DESIRED_PYTHON in - 3.14t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.14) - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c 
conda-forge" - desired_python="3.14.0rc1" - ;; - 3.13t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" - desired_python="3.13" - ;; - *) - # shellcheck disable=SC2153 - desired_python=${DESIRED_PYTHON} - ;; - esac - # shellcheck disable=SC2086 - conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} - conda activate test_conda_env + python -mvenv test_venv + source test_venv/bin/activate pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v # shellcheck disable=SC2086 diff --git a/.github/workflows/_binary-test-linux.yml b/.github/workflows/_binary-test-linux.yml index 2d9e4d0e27b25..476dd182db0f8 100644 --- a/.github/workflows/_binary-test-linux.yml +++ b/.github/workflows/_binary-test-linux.yml @@ -187,8 +187,6 @@ jobs: - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG uses: pytorch/test-infra/.github/actions/setup-nvidia@main - with: - driver-version: ${{ startsWith(inputs.GPU_ARCH_VERSION, '13') && '580.65.06' || '570.133.07' }} if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }} - name: configure aws credentials diff --git a/.github/workflows/_docs.yml b/.github/workflows/_docs.yml index ff5dbe604bac1..aba3fa3dceec2 100644 --- a/.github/workflows/_docs.yml +++ b/.github/workflows/_docs.yml @@ -75,10 +75,6 @@ jobs: runner: ${{ inputs.runner_prefix }}linux.2xlarge # It takes less than 30m to finish python docs unless there are issues timeout-minutes: 30 - - docs_type: functorch - runner: ${{ inputs.runner_prefix }}linux.2xlarge - # It takes less than 15m to finish functorch docs unless there are issues - timeout-minutes: 15 # Set a fixed name for this job instead of using the current matrix-generated name, i.e. 
build-docs (cpp, linux.12xlarge, 180) # The current name requires updating the database last docs push query from test-infra every time the matrix is updated name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }} @@ -211,16 +207,6 @@ jobs: path: cppdocs/ s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/cppdocs - - name: Upload functorch Docs Preview - uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0 - if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'functorch' && steps.build-docs.outcome == 'success' }} - with: - retention-days: 14 - s3-bucket: doc-previews - if-no-files-found: error - path: functorch_ghpages/nightly/ - s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/functorchdocs - - name: Teardown Linux uses: pytorch/test-infra/.github/actions/teardown-linux@main if: always() diff --git a/.github/workflows/_get-changed-files.yml b/.github/workflows/_get-changed-files.yml index 55712b0652702..311c594a11eff 100644 --- a/.github/workflows/_get-changed-files.yml +++ b/.github/workflows/_get-changed-files.yml @@ -2,6 +2,12 @@ name: Get Changed Files on: workflow_call: + inputs: + all_files: + description: "Whether to return all files instead of just changed files" + required: false + type: boolean + default: false outputs: changed-files: description: "List of changed files (space-separated) or '*' if not in a PR" @@ -26,17 +32,23 @@ jobs: # Get the PR number from the github context PR_NUMBER="${{ github.event.number }}" - # Use gh CLI to get changed files in the PR with explicit repo - CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//') - - if [ -z "$CHANGED_FILES" ]; then - echo "No changed files found, setting to '*'" - CHANGED_FILES="*" + # Check if all_files is requested + if [ "${{ inputs.all_files }}" = "true" ]; then + echo "all_files input is true, returning all files" + echo "changed-files=*" >> "$GITHUB_OUTPUT" + else + # Use gh CLI to get changed files in the PR with explicit repo + CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//') + + if [ -z "$CHANGED_FILES" ]; then + echo "No changed files found, setting to '*'" + CHANGED_FILES="*" + fi + + echo "Changed files: $CHANGED_FILES" + echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT" fi - echo "Changed files: $CHANGED_FILES" - echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT" - else echo "Not in PR context, setting changed files to '*'" echo "changed-files=*" >> "$GITHUB_OUTPUT" diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml index 66579b573a63d..537e94488b363 100644 --- a/.github/workflows/_linux-test.yml +++ b/.github/workflows/_linux-test.yml @@ -169,7 +169,7 @@ jobs: id: install-nvidia-driver uses: pytorch/test-infra/.github/actions/setup-nvidia@main with: - driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '570.133.07' }} + driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '580.82.07' }} if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }} - name: Setup GPU_FLAG for docker run diff --git a/.github/workflows/_rocm-test.yml 
b/.github/workflows/_rocm-test.yml index f73972942b5f9..7781e1f65fd16 100644 --- a/.github/workflows/_rocm-test.yml +++ b/.github/workflows/_rocm-test.yml @@ -62,6 +62,11 @@ on: required: false type: number default: 1 + secrets: + HUGGING_FACE_HUB_TOKEN: + required: false + description: | + HF Auth token to avoid rate limits when downloading models or datasets from hub env: GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} @@ -76,10 +81,9 @@ jobs: strategy: matrix: ${{ fromJSON(inputs.test-matrix) }} fail-fast: false - timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }} runs-on: ${{ matrix.runner }} + timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }} steps: - # [see note: pytorch repo ref] - name: Checkout PyTorch uses: pytorch/pytorch/.github/actions/checkout-pytorch@main with: @@ -131,6 +135,9 @@ jobs: - name: Start monitoring script id: monitor-script + if: ${{ !inputs.disable-monitor }} + shell: bash + continue-on-error: true env: JOB_ID: ${{ steps.get-job-id.outputs.job-id }} JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} @@ -138,9 +145,6 @@ jobs: WORKFLOW_RUN_ID: ${{github.run_id}} MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }} MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} - if: ${{ !inputs.disable-monitor }} - shell: bash - continue-on-error: true run: | python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7 python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & @@ -178,6 +182,12 @@ jobs: run: | echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}" + - name: Preserve github env variables for use in docker + shell: bash + run: | + env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}" + env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Test id: test env: @@ -193,20 +203,22 @@ jobs: JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} BRANCH: ${{ steps.parse-ref.outputs.branch }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }} + TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: ${{ matrix.shard }} + NUM_TEST_SHARDS: ${{ matrix.num_shards }} + REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }} CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }} NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }} NO_TD: ${{ steps.keep-going.outputs.ci-no-td }} - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }} DOCKER_IMAGE: ${{ inputs.docker-image }} PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }} PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }} TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }} DASHBOARD_TAG: ${{ inputs.dashboard-tag }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }} run: | set -x @@ -236,6 +248,7 @@ jobs: -e GITHUB_RUN_ATTEMPT \ -e JOB_ID \ -e JOB_NAME \ + -e BASE_SHA \ -e BRANCH \ -e SHA1 \ -e AWS_DEFAULT_REGION \ @@ -253,10 +266,12 @@ jobs: -e 
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \ -e PYTORCH_TEST_RERUN_DISABLED_TESTS \ -e TESTS_TO_INCLUDE \ + -e HUGGING_FACE_HUB_TOKEN \ -e DASHBOARD_TAG \ --env-file="${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}" \ --ulimit stack=10485760:83886080 \ --ulimit core=0 \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ --shm-size="8g" \ diff --git a/.github/workflows/_win-build.yml b/.github/workflows/_win-build.yml index 7067d79eb0758..d447dba4a511c 100644 --- a/.github/workflows/_win-build.yml +++ b/.github/workflows/_win-build.yml @@ -151,7 +151,7 @@ jobs: BUILD_WHEEL: 1 MAX_JOBS: 8 CUDA_VERSION: ${{ inputs.cuda-version }} - PYTHON_VERSION: "3.9" + PYTHON_VERSION: "3.10" SCCACHE_BUCKET: "ossci-compiler-cache" SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }} SCCACHE_REGION: us-east-1 diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml index 5049ef61f6930..a93f10c123aac 100644 --- a/.github/workflows/_win-test.yml +++ b/.github/workflows/_win-test.yml @@ -184,7 +184,7 @@ jobs: env: USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }} INSTALL_WINDOWS_SDK: 1 - PYTHON_VERSION: 3.9 + PYTHON_VERSION: "3.10" CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }} diff --git a/.github/workflows/build-triton-wheel.yml b/.github/workflows/build-triton-wheel.yml index 932d9c8863027..b9ccc6fc361a7 100644 --- a/.github/workflows/build-triton-wheel.yml +++ b/.github/workflows/build-triton-wheel.yml @@ -50,7 +50,7 @@ jobs: strategy: fail-fast: false matrix: - py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ] + py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ] device: ["cuda", "rocm", "xpu", "aarch64"] docker-image: ["pytorch/manylinux2_28-builder:cpu"] include: @@ -108,9 +108,6 @@ jobs: # Determine python executable for given version case $PY_VERS in - 3.9) - PYTHON_EXECUTABLE=/opt/python/cp39-cp39/bin/python - ;; 3.10) PYTHON_EXECUTABLE=/opt/python/cp310-cp310/bin/python ;; @@ -194,7 +191,7 @@ jobs: strategy: fail-fast: false matrix: - py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ] + py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ] device: ["xpu"] timeout-minutes: 40 env: diff --git a/.github/workflows/build-vllm-wheel.yml b/.github/workflows/build-vllm-wheel.yml index 658e02ede6fbd..2c66353748417 100644 --- a/.github/workflows/build-vllm-wheel.yml +++ b/.github/workflows/build-vllm-wheel.yml @@ -12,6 +12,9 @@ on: paths: - .github/workflows/build-vllm-wheel.yml - .github/ci_commit_pins/vllm.txt + schedule: + # every morning at 01:30PM UTC, 9:30AM EST, 6:30AM PST + - cron: 30 13 * * * concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} @@ -24,21 +27,33 @@ jobs: fail-fast: false matrix: python-version: [ '3.12' ] - # TODO (huydhn): Add cu130 https://github.com/pytorch/pytorch/pull/162000#issuecomment-3261541554 + # TODO (huydhn): Add cu130 after https://github.com/vllm-project/vllm/issues/24464 is resolved + platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ] device: [ 'cu128', 'cu129' ] - runner: [ 'linux.12xlarge.memory' ] include: - - device: cu128 + - platform: manylinux_2_28_x86_64 + device: cu128 manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.8' - - device: cu129 + 
runner: linux.12xlarge.memory + - platform: manylinux_2_28_x86_64 + device: cu129 manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.9' - name: "Build ${{ matrix.device }} vLLM wheel" + runner: linux.12xlarge.memory + - platform: manylinux_2_28_aarch64 + device: cu128 + manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.8' + runner: linux.arm64.r7g.12xlarge.memory + - platform: manylinux_2_28_aarch64 + device: cu129 + manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.9' + runner: linux.arm64.r7g.12xlarge.memory + name: "Build ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}" runs-on: ${{ matrix.runner }} timeout-minutes: 480 env: PY_VERS: ${{ matrix.python-version }} MANYLINUX_IMAGE: ${{ matrix.manylinux-image }} - PLATFORM: 'manylinux_2_28_x86_64' + PLATFORM: ${{ matrix.platform }} BUILD_DEVICE: ${{ matrix.device }} steps: - name: Setup SSH (Click me for login details) @@ -59,20 +74,6 @@ jobs: run: | set -eux - # Keep PyTorch nightly wheel here so that we can install it later during - # vLLM build process - mkdir -p "${RUNNER_TEMP}/artifacts/" - - container_name=$(docker run \ - --tty \ - --detach \ - -e PLATFORM \ - -v "${GITHUB_WORKSPACE}:/pytorch" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w /artifacts/ \ - "${MANYLINUX_IMAGE}" - ) - # Determine python executable for given version (copied from build-triton-wheel) case $PY_VERS in 3.10) @@ -102,6 +103,21 @@ jobs: ;; esac + # Keep PyTorch nightly wheel here so that we can install it later during + # vLLM build process + mkdir -p "${RUNNER_TEMP}/artifacts/" + + container_name=$(docker run \ + --tty \ + --detach \ + -e PLATFORM \ + -e PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" \ + -v "${GITHUB_WORKSPACE}:/pytorch" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w /artifacts/ \ + "${MANYLINUX_IMAGE}" + ) + docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -mpip install \ --pre torch torchvision torchaudio \ --index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}" @@ -113,7 +129,6 @@ jobs: --index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}" # Save this for later - echo "PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE}" >> "$GITHUB_ENV" echo "container_name=${container_name}" >> "$GITHUB_ENV" - name: Build vLLM wheel @@ -131,41 +146,12 @@ jobs: set -eux # Get these wheels ready, the vllm renaming logic is copied from its .buildkite/scripts/upload-wheels.sh - docker exec -t "${container_name}" bash -c " - set -eux - - nightly=\$(unzip -p torch-* '**/METADATA' | grep '^Version: ' | cut -d' ' -f2 | cut -d'.' -f4) - - pushd externals/vllm/wheels - for package in xformers flashinfer-python vllm; do - pushd \$package - auditwheel repair --plat \$PLATFORM *.whl \ - --exclude libc10* --exclude libtorch* --exclude libcu* --exclude libnv* - repair_wheel=\$(find wheelhouse -name *\${PLATFORM}*) - repair_wheel=\$(basename \${repair_wheel}) - popd - - cp \${package}/wheelhouse/\${repair_wheel} . - version=\$(unzip -p \$repair_wheel '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) - - if [[ \$package == vllm ]]; then - new_wheel=\${repair_wheel/\$version/1.0.0.\$nightly} - else - major_version=\$(echo \$version | tr '.+' '.' | cut -d'.' 
-f1-3) - new_wheel=\${repair_wheel/\$version/\$major_version.\$nightly} - fi - - mv -- \$repair_wheel \$new_wheel - rm -rf \$package - done - popd - " - + docker exec -t "${container_name}" bash -c /pytorch/.github/scripts/prepare_vllm_wheels.sh docker exec -t "${container_name}" chown -R 1000:1000 /artifacts - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 with: - name: vllm-wheel-${{ matrix.device }}-${{ matrix.python-version }}-${{ env.PLATFORM }} + name: vllm-wheel-${{ matrix.device }}-${{ matrix.platform }}-${{ matrix.python-version }} if-no-files-found: error path: ${{ runner.temp }}/artifacts/externals/vllm/wheels/*.whl @@ -175,27 +161,29 @@ jobs: # Copied from build-triton-wheel workflow (mostly) upload-wheel: - name: "Upload ${{ matrix.device }} vLLM wheel" + name: "Upload ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}" needs: - build-wheel runs-on: ubuntu-latest strategy: fail-fast: false matrix: + platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ] device: [ 'cu128', 'cu129' ] env: + PLATFORM: ${{ matrix.platform }} BUILD_DEVICE: ${{ matrix.device }} permissions: id-token: write contents: read container: image: continuumio/miniconda3:4.12.0 - environment: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') && 'nightly-wheel-upload' || '' }} + environment: ${{ ((github.event_name == 'push' && github.event.ref == 'refs/heads/main') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && 'nightly-wheel-upload' || '' }} steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Configure AWS credentials(PyTorch account) for main - if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' }} + if: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 with: role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_nightly_build_wheels @@ -219,15 +207,15 @@ jobs: run: | set -eux mkdir -p "${RUNNER_TEMP}/artifacts/" - mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-*/* "${RUNNER_TEMP}/artifacts/" + mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-"${PLATFORM}"-*/* "${RUNNER_TEMP}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }} + - name: Set DRY_RUN + if: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} shell: bash run: | echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) + - name: Set UPLOAD_CHANNEL if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') }} shell: bash run: | diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 492f41775d9de..272a2d1c691db 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -70,9 +70,8 @@ jobs: pytorch-linux-jammy-py3-clang18-asan, pytorch-linux-jammy-py3-clang12-onnx, pytorch-linux-jammy-linter, - pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter, - # Executorch pin needs update - # pytorch-linux-jammy-py3-clang12-executorch, + 
pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter, + pytorch-linux-jammy-py3-clang12-executorch, pytorch-linux-jammy-py3.12-triton-cpu, pytorch-linux-noble-riscv64-py3.12-gcc14 ] diff --git a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml index 860ee21cda6a7..651b034b2edc1 100644 --- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml @@ -62,7 +62,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_10-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -128,11 +128,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_10-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | 
nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -174,11 +174,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_10-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -220,11 +220,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_10-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -265,7 +265,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_11-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -331,11 +331,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_11-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 
'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -377,11 +377,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_11-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; 
platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -423,11 +423,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_11-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 
'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -468,7 +468,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_12-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -534,11 +534,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_12-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -580,11 +580,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_12-cuda-aarch64-12_8 
build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -626,11 +626,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_12-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -671,7 +671,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -737,11 +737,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -783,11 +783,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system 
== 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -829,11 +829,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -874,7 +874,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13t-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ 
-940,11 +940,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13t-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -986,11 +986,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13t-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1032,11 +1032,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13t-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1077,7 +1077,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -1143,11 +1143,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | 
nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1189,11 +1189,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; 
platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1235,11 +1235,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1280,7 +1280,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14t-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -1346,11 +1346,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14t-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - 
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1392,11 +1392,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14t-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1438,11 +1438,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14t-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; 
platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-linux-binary-manywheel-main.yml b/.github/workflows/generated-linux-binary-manywheel-main.yml index ec08b2c78eb67..96b9f9f739f72 100644 --- a/.github/workflows/generated-linux-binary-manywheel-main.yml +++ b/.github/workflows/generated-linux-binary-manywheel-main.yml @@ -60,7 +60,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | 
nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_8-test: # Testing diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml index 8a581a1f21fe1..0f87f97df694d 100644 --- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml @@ -127,7 +127,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_6-test: # Testing @@ -193,7 +193,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_8-test: # Testing @@ -259,7 +259,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 
'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda13_0-test: # Testing @@ -719,7 +719,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | 
nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_6-test: # Testing @@ -785,7 +785,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_8-test: # Testing @@ -851,7 +851,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda13_0-test: # Testing @@ -1311,7 +1311,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 
'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_6-test: # Testing @@ -1377,7 +1377,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_8-test: # 
Testing @@ -1443,7 +1443,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda13_0-test: # Testing @@ -1903,7 +1903,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_6-test: # Testing @@ -1969,7 +1969,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | 
nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_8-test: # Testing @@ -2035,7 +2035,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda13_0-test: # Testing @@ -2495,7 +2495,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_6-test: # Testing @@ -2561,7 +2561,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_8-test: # Testing @@ -2627,7 +2627,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; 
platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda13_0-test: # Testing @@ -3087,7 +3087,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-cuda12_6-test: # Testing @@ -3153,7 +3153,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-cuda12_8-test: # Testing @@ -3219,7 +3219,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' 
and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-cuda13_0-test: # Testing @@ -3679,7 +3679,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' secrets: 
github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-cuda12_6-test: # Testing @@ -3745,7 +3745,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-cuda12_8-test: # Testing @@ -3811,7 +3811,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-cuda13_0-test: # Testing diff --git a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml index 8177bac3fe216..18706347026ba 100644 --- a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml +++ b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml @@ -44,7 +44,7 @@ jobs: issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} - manywheel-py3_9-rocm6_4-build: + manywheel-py3_10-rocm6_4-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -58,16 +58,16 @@ jobs: GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-rocm6_4 + build_name: manywheel-py3_10-rocm6_4 build_environment: linux-binary-manywheel-rocm secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-rocm6_4-test: # Testing + manywheel-py3_10-rocm6_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_9-rocm6_4-build + - manywheel-py3_10-rocm6_4-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -82,14 +82,14 @@ jobs: SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_9-rocm6_4 + name: manywheel-py3_10-rocm6_4 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 diff --git 
a/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml b/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml index 500f8fa07af6b..cd912650eb17d 100644 --- a/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml @@ -60,13 +60,13 @@ jobs: echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" # shellcheck disable=SC2129 echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Setup Python + uses: actions/setup-python@v6 + with: + # TODO: Removeme once 3.14 is out + # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 + python-version: "3.10.4" + freethreaded: false - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -81,13 +81,9 @@ jobs: working-directory: pytorch - name: Populate binary env run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" diff --git a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml index 6aee57b503aa2..8522d2d369930 100644 --- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml @@ -56,13 +56,13 @@ jobs: echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" # shellcheck disable=SC2129 echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Setup Python + uses: actions/setup-python@v6 + with: + # TODO: Removeme once 3.14 is out + # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 + python-version: "3.10.4" + freethreaded: false - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -77,13 +77,9 @@ jobs: working-directory: pytorch - name: Populate binary env run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -99,8 +95,6 @@ jobs: 
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" - name: Test PyTorch wheel run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -111,33 +105,9 @@ jobs: SMOKE_TEST_PARAMS="" - EXTRA_CONDA_INSTALL_FLAGS="" - CONDA_ENV_CREATE_FLAGS="" - # shellcheck disable=SC2153 - case $DESIRED_PYTHON in - 3.14t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.14) - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.13t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" - desired_python="3.13" - ;; - *) - # shellcheck disable=SC2153 - desired_python=${DESIRED_PYTHON} - ;; - esac - # shellcheck disable=SC2086 - conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} - conda activate test_conda_env + python -mvenv test_venv + source test_venv/bin/activate pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v # shellcheck disable=SC2086 @@ -196,13 +166,13 @@ jobs: echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" # shellcheck disable=SC2129 echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Setup Python + uses: actions/setup-python@v6 + with: + # TODO: Removeme once 3.14 is out + # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 + python-version: "3.11.4" + freethreaded: false - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -217,13 +187,9 @@ jobs: working-directory: pytorch - name: Populate binary env run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -239,8 +205,6 @@ jobs: "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" - name: Test PyTorch wheel run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -251,33 +215,9 @@ jobs: SMOKE_TEST_PARAMS="" - EXTRA_CONDA_INSTALL_FLAGS="" - CONDA_ENV_CREATE_FLAGS="" - # shellcheck disable=SC2153 - case $DESIRED_PYTHON in - 3.14t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.14) - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.13t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" - desired_python="3.13" - ;; - *) - # shellcheck disable=SC2153 - 
desired_python=${DESIRED_PYTHON} - ;; - esac - # shellcheck disable=SC2086 - conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} - conda activate test_conda_env + python -mvenv test_venv + source test_venv/bin/activate pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v # shellcheck disable=SC2086 @@ -336,13 +276,13 @@ jobs: echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" # shellcheck disable=SC2129 echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Setup Python + uses: actions/setup-python@v6 + with: + # TODO: Removeme once 3.14 is out + # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 + python-version: "3.12.4" + freethreaded: false - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -357,13 +297,9 @@ jobs: working-directory: pytorch - name: Populate binary env run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -379,8 +315,6 @@ jobs: "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" - name: Test PyTorch wheel run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -391,33 +325,9 @@ jobs: SMOKE_TEST_PARAMS="" - EXTRA_CONDA_INSTALL_FLAGS="" - CONDA_ENV_CREATE_FLAGS="" - # shellcheck disable=SC2153 - case $DESIRED_PYTHON in - 3.14t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.14) - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.13t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" - desired_python="3.13" - ;; - *) - # shellcheck disable=SC2153 - desired_python=${DESIRED_PYTHON} - ;; - esac - # shellcheck disable=SC2086 - conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} - conda activate test_conda_env + python -mvenv test_venv + source test_venv/bin/activate pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v # shellcheck disable=SC2086 @@ -476,13 +386,13 @@ jobs: echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" # shellcheck disable=SC2129 echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x 
"${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Setup Python + uses: actions/setup-python@v6 + with: + # TODO: Removeme once 3.14 is out + # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 + python-version: "3.13.4" + freethreaded: false - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -497,13 +407,9 @@ jobs: working-directory: pytorch - name: Populate binary env run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -519,8 +425,6 @@ jobs: "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" - name: Test PyTorch wheel run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -531,33 +435,9 @@ jobs: SMOKE_TEST_PARAMS="" - EXTRA_CONDA_INSTALL_FLAGS="" - CONDA_ENV_CREATE_FLAGS="" - # shellcheck disable=SC2153 - case $DESIRED_PYTHON in - 3.14t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.14) - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.13t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" - desired_python="3.13" - ;; - *) - # shellcheck disable=SC2153 - desired_python=${DESIRED_PYTHON} - ;; - esac - # shellcheck disable=SC2086 - conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} - conda activate test_conda_env + python -mvenv test_venv + source test_venv/bin/activate pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v # shellcheck disable=SC2086 @@ -616,13 +496,13 @@ jobs: echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" # shellcheck disable=SC2129 echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Setup Python + uses: actions/setup-python@v6 + with: + # TODO: Removeme once 3.14 is out + # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 + python-version: "3.13.4" + freethreaded: true - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -637,13 +517,9 @@ jobs: working-directory: pytorch - name: Populate binary env run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source 
"${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -659,8 +535,6 @@ jobs: "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" - name: Test PyTorch wheel run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -671,33 +545,9 @@ jobs: SMOKE_TEST_PARAMS="" - EXTRA_CONDA_INSTALL_FLAGS="" - CONDA_ENV_CREATE_FLAGS="" - # shellcheck disable=SC2153 - case $DESIRED_PYTHON in - 3.14t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.14) - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.13t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" - desired_python="3.13" - ;; - *) - # shellcheck disable=SC2153 - desired_python=${DESIRED_PYTHON} - ;; - esac - # shellcheck disable=SC2086 - conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} - conda activate test_conda_env + python -mvenv test_venv + source test_venv/bin/activate pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v # shellcheck disable=SC2086 @@ -756,13 +606,13 @@ jobs: echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" # shellcheck disable=SC2129 echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Setup Python + uses: actions/setup-python@v6 + with: + # TODO: Removeme once 3.14 is out + # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 + python-version: "3.14.0-rc.2" + freethreaded: false - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -777,13 +627,9 @@ jobs: working-directory: pytorch - name: Populate binary env run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -799,8 +645,6 @@ jobs: "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" - name: Test PyTorch wheel run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -811,33 +655,9 @@ jobs: SMOKE_TEST_PARAMS="" - EXTRA_CONDA_INSTALL_FLAGS="" - CONDA_ENV_CREATE_FLAGS="" - # shellcheck disable=SC2153 - case $DESIRED_PYTHON in - 3.14t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.14) - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.13t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - 
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" - desired_python="3.13" - ;; - *) - # shellcheck disable=SC2153 - desired_python=${DESIRED_PYTHON} - ;; - esac - # shellcheck disable=SC2086 - conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} - conda activate test_conda_env + python -mvenv test_venv + source test_venv/bin/activate pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v # shellcheck disable=SC2086 @@ -896,13 +716,13 @@ jobs: echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" # shellcheck disable=SC2129 echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Setup Python + uses: actions/setup-python@v6 + with: + # TODO: Removeme once 3.14 is out + # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 + python-version: "3.14.0-rc.2" + freethreaded: true - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -917,13 +737,9 @@ jobs: working-directory: pytorch - name: Populate binary env run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -939,8 +755,6 @@ jobs: "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" - name: Test PyTorch wheel run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -951,33 +765,9 @@ jobs: SMOKE_TEST_PARAMS="" - EXTRA_CONDA_INSTALL_FLAGS="" - CONDA_ENV_CREATE_FLAGS="" - # shellcheck disable=SC2153 - case $DESIRED_PYTHON in - 3.14t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.14) - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.13t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" - desired_python="3.13" - ;; - *) - # shellcheck disable=SC2153 - desired_python=${DESIRED_PYTHON} - ;; - esac - # shellcheck disable=SC2086 - conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} - conda activate test_conda_env + python -mvenv test_venv + source test_venv/bin/activate pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v # shellcheck disable=SC2086 diff --git a/.github/workflows/inductor-nightly.yml b/.github/workflows/inductor-nightly.yml index fe0f102406b6a..78602e05586b7 100644 --- a/.github/workflows/inductor-nightly.yml +++ b/.github/workflows/inductor-nightly.yml @@ -37,7 +37,7 @@ jobs: uses: ./.github/workflows/_linux-build.yml needs: get-default-label-prefix with: - build-environment: linux-jammy-py3.9-gcc11-build + 
build-environment: linux-jammy-py3.10-gcc11-build docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" test-matrix: | @@ -56,7 +56,7 @@ jobs: uses: ./.github/workflows/_linux-test.yml needs: nightly-dynamo-benchmarks-build with: - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build docker-image: ${{ needs.nightly-dynamo-benchmarks-build.outputs.docker-image }} test-matrix: ${{ needs.nightly-dynamo-benchmarks-build.outputs.test-matrix }} timeout-minutes: 720 diff --git a/.github/workflows/inductor-perf-test-nightly-x86-zen.yml b/.github/workflows/inductor-perf-test-nightly-x86-zen.yml index 170de752ab875..a7110b0fd9328 100644 --- a/.github/workflows/inductor-perf-test-nightly-x86-zen.yml +++ b/.github/workflows/inductor-perf-test-nightly-x86-zen.yml @@ -43,6 +43,11 @@ on: required: false type: boolean default: false + freezing: + description: Run freezing? + required: false + type: boolean + default: true benchmark_configs: description: The list of configs used the benchmark required: false @@ -75,7 +80,7 @@ jobs: needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | { include: [ @@ -101,8 +106,8 @@ jobs: needs: inductor-build if: github.event.schedule == '0 7 * * *' with: - build-environment: linux-jammy-py3.9-gcc11-build - dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true + build-environment: linux-jammy-py3.10-gcc11-build + dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true docker-image: ${{ needs.inductor-build.outputs.docker-image }} test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} timeout-minutes: 720 @@ -116,10 +121,9 @@ jobs: name: inductor-test uses: ./.github/workflows/_linux-test.yml needs: inductor-build - if: github.event_name == 'workflow_dispatch' with: - build-environment: linux-jammy-py3.9-gcc11-build - dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }} + build-environment: linux-jammy-py3.10-gcc11-build + dashboard-tag: training-${{ inputs.training || 'false' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'true' }}-aotinductor-${{ inputs.aotinductor || 'true' }}-freezing-${{ inputs.freezing || 'true' }} docker-image: ${{ needs.inductor-build.outputs.docker-image }} test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} timeout-minutes: 720 diff --git a/.github/workflows/inductor-perf-test-nightly-x86.yml b/.github/workflows/inductor-perf-test-nightly-x86.yml index f894b8fdc6e03..0533184df2e0e 100644 --- a/.github/workflows/inductor-perf-test-nightly-x86.yml +++ b/.github/workflows/inductor-perf-test-nightly-x86.yml @@ -80,7 +80,7 @@ jobs: needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build docker-image-name: 
ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | { include: [ @@ -107,7 +107,7 @@ jobs: needs: inductor-build if: github.event.schedule == '0 7 * * *' with: - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true docker-image: ${{ needs.inductor-build.outputs.docker-image }} test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} @@ -124,7 +124,7 @@ jobs: needs: inductor-build if: github.event_name == 'workflow_dispatch' with: - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-freezing-${{ inputs.freezing }} docker-image: ${{ needs.inductor-build.outputs.docker-image }} test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} diff --git a/.github/workflows/inductor-periodic.yml b/.github/workflows/inductor-periodic.yml index 21d965eaeaada..454cd166c90bb 100644 --- a/.github/workflows/inductor-periodic.yml +++ b/.github/workflows/inductor-periodic.yml @@ -39,7 +39,7 @@ jobs: runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks - cuda-arch-list: '8.6' + cuda-arch-list: '8.0;8.6' test-matrix: | { include: [ { config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, @@ -62,7 +62,7 @@ jobs: { config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.aws.a100" }, { config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, @@ -154,7 +154,7 @@ jobs: uses: ./.github/workflows/_linux-build.yml needs: get-default-label-prefix with: - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" test-matrix: | @@ -200,7 +200,7 @@ jobs: uses: ./.github/workflows/_linux-test.yml needs: periodic-dynamo-benchmarks-cpu-build with: - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build docker-image: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.docker-image }} test-matrix: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.test-matrix }} secrets: inherit diff --git a/.github/workflows/inductor-unittest.yml b/.github/workflows/inductor-unittest.yml index 2125a8559363b..6ab276a57fc4d 100644 --- 
a/.github/workflows/inductor-unittest.yml +++ b/.github/workflows/inductor-unittest.yml @@ -110,7 +110,7 @@ jobs: uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" test-matrix: | @@ -127,7 +127,7 @@ jobs: uses: ./.github/workflows/_linux-test.yml needs: inductor-cpu-build with: - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }} test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }} secrets: inherit diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml index 4189d24a7b14f..2616141c0dc2a 100644 --- a/.github/workflows/inductor.yml +++ b/.github/workflows/inductor.yml @@ -79,7 +79,7 @@ jobs: uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" test-matrix: | @@ -101,7 +101,7 @@ jobs: uses: ./.github/workflows/_linux-test.yml needs: inductor-cpu-build with: - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }} test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }} secrets: inherit diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index b1a6dfb390711..80f78b01c9808 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -31,6 +31,8 @@ jobs: if: github.repository_owner == 'pytorch' name: Get changed files uses: ./.github/workflows/_get-changed-files.yml + with: + all_files: ${{ contains(github.event.pull_request.labels.*.name, 'lint-all-files') || contains(github.event.pull_request.labels.*.name, 'Reverted') }} lintrunner-clang: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main @@ -53,7 +55,7 @@ jobs: with: timeout: 120 runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" - docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter + docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter # NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout # to run git rev-parse HEAD~:.ci/docker when a new image is needed fetch-depth: 0 @@ -264,10 +266,10 @@ jobs: with: submodules: false fetch-depth: 1 - - name: Setup Python 3.9 + - name: Setup Python 3.10 uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: - python-version: '3.9' + python-version: '3.10' architecture: x64 cache: pip - name: Install dependencies diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 65b8781be7585..696c5b68b475b 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -54,7 +54,7 @@ jobs: - get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11 docker-image: ${{ needs.docs-build.outputs.docker-image }} push: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || 
startsWith(github.event.ref, 'refs/tags/v') }}
       run-doxygen: true
diff --git a/.github/workflows/operator_benchmark.yml b/.github/workflows/operator_benchmark.yml
index aaf32c160f0dc..dcdc2cd0ba24e 100644
--- a/.github/workflows/operator_benchmark.yml
+++ b/.github/workflows/operator_benchmark.yml
@@ -14,6 +14,10 @@ on:
   schedule:
     # Run at 07:00 UTC every Sunday
     - cron: 0 7 * * 0
+  pull_request:
+    paths:
+      - benchmarks/operator_benchmark/**
+      - .github/workflows/operator_benchmark.yml
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@@ -29,7 +33,7 @@ jobs:
     name: opbenchmark-build
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-jammy-py3.9-gcc11-build
+      build-environment: linux-jammy-py3.10-gcc11-build
       docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
       test-matrix: |
         { include: [
@@ -42,7 +46,7 @@ jobs:
     name: opbenchmark-on-demand-build
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-jammy-py3.9-gcc11-build
+      build-environment: linux-jammy-py3.10-gcc11-build
       docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
       test-matrix: |
         { include: [
@@ -55,7 +59,7 @@ jobs:
     uses: ./.github/workflows/_linux-test.yml
     needs: opbenchmark-build
     with:
-      build-environment: linux-jammy-py3.9-gcc11-build
+      build-environment: linux-jammy-py3.10-gcc11-build
       docker-image: ${{ needs.opbenchmark-build.outputs.docker-image }}
       test-matrix: ${{ needs.opbenchmark-build.outputs.test-matrix }}
     secrets: inherit
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 3f13fbf276882..e0e1065c5aba0 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -316,32 +316,6 @@ jobs:
         ]}
     secrets: inherit
-  linux-jammy-py3-clang12-executorch-build:
-    if: false # Docker build needs pin update
-    name: linux-jammy-py3-clang12-executorch
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-label-type
-    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-py3-clang12-executorch
-      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch
-      test-matrix: |
-        { include: [
-          { config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
-        ]}
-    secrets: inherit
-
-  linux-jammy-py3-clang12-executorch-test:
-    name: linux-jammy-py3-clang12-executorch
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-jammy-py3-clang12-executorch-build
-    if: false # Has been broken for a while
-    with:
-      build-environment: linux-jammy-py3-clang12-executorch
-      docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
-    secrets: inherit
-
   linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
     name: cuda12.8-py3.10-gcc9-sm75
     uses: ./.github/workflows/_linux-build.yml
diff --git a/.github/workflows/quantization-periodic.yml b/.github/workflows/quantization-periodic.yml
new file mode 100644
index 0000000000000..688f557eaf0e4
--- /dev/null
+++ b/.github/workflows/quantization-periodic.yml
@@ -0,0 +1,54 @@
+name: quantization-periodic
+
+on:
+  push:
+    tags:
+      - ciflow/quantization-periodic/*
+  workflow_dispatch:
+  schedule:
+    # run weekly
+    - cron: "45 0 * * 0"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type ==
'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + get-default-label-prefix: + name: get-default-label-prefix + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + opt_out_experiments: lf + + periodic-quantization-build: + name: periodic-quantization-build + uses: ./.github/workflows/_linux-build.yml + needs: get-default-label-prefix + with: + runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" + build-environment: linux-jammy-cuda12.8-cudnn9-py3-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '8.9' + test-matrix: | + { include: [ + { config: "quantization", shard: 1, num_shards: 1, runner: "${{ needs.get-default-label-prefix.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + ]} + secrets: inherit + periodic-test-quantization: + name: periodic-test-quantization + uses: ./.github/workflows/_linux-test.yml + needs: periodic-quantization-build + with: + build-environment: linux-jammy-cuda12.8-cudnn9-py3-gcc11 + docker-image: ${{ needs.periodic-quantization-build.outputs.docker-image }} + test-matrix: ${{ needs.periodic-quantization-build.outputs.test-matrix }} + secrets: inherit diff --git a/.github/workflows/test-b200.yml b/.github/workflows/test-b200.yml new file mode 100644 index 0000000000000..ef7f75bc4b2b4 --- /dev/null +++ b/.github/workflows/test-b200.yml @@ -0,0 +1,76 @@ +# B200 Smoke Tests CI Workflow +# +# This workflow runs smoke tests on B200 hardware +# +# Flow: +# 1. Builds PyTorch with CUDA 12.8+ and sm100 architecture for B200 +# 2. Runs smoke tests on linux.dgx.b200 runner +# 3. 
Tests executed are defined in .ci/pytorch/test.sh -> test_python_smoke() function +# +# Triggered by: +# - Pull requests modifying this workflow file +# - Manual dispatch +# - Schedule (every 6 hours) +# - Adding ciflow/b200 label to a PR (creates ciflow/b200/* tag) + +name: B200 Smoke Tests + +on: + pull_request: + paths: + - .github/workflows/test-b200.yml + workflow_dispatch: + schedule: + - cron: 0 4,10,16,22 * * * # every 6 hours + push: + tags: + - ciflow/b200/* + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + + get-label-type: + if: github.repository_owner == 'pytorch' + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + + linux-jammy-cuda12_8-py3_10-gcc11-sm100-build: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm100 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runner: linux.12xlarge.memory + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '10.0' + test-matrix: | + { include: [ + { config: "smoke_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" }, + ]} + # config: "smoke_b200" maps to test_python_smoke_b200() in .ci/pytorch/test.sh + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-sm100-test: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm100 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-sm100-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.test-matrix }} + aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + secrets: inherit \ No newline at end of file diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 4dd465d70803d..0140c2d3c00cb 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -240,7 +240,7 @@ jobs: needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11 docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | { include: [ @@ -255,7 +255,31 @@ jobs: - verify-cachebench-cpu-build - target-determination with: - build-environment: linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11 docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }} test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }} secrets: inherit + + linux-jammy-py3-clang12-executorch-build: + name: linux-jammy-py3-clang12-executorch + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-py3-clang12-executorch + docker-image-name: 
ci-image:pytorch-linux-jammy-py3-clang12-executorch + test-matrix: | + { include: [ + { config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, + ]} + secrets: inherit + + linux-jammy-py3-clang12-executorch-test: + name: linux-jammy-py3-clang12-executorch + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-py3-clang12-executorch-build + with: + build-environment: linux-jammy-py3-clang12-executorch + docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }} + secrets: inherit diff --git a/.github/workflows/unstable.yml b/.github/workflows/unstable.yml index 7f0fe6058bd08..b5955127d9fb3 100644 --- a/.github/workflows/unstable.yml +++ b/.github/workflows/unstable.yml @@ -53,27 +53,3 @@ jobs: issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} - - linux-jammy-py3_9-clang9-xla-build: - name: linux-jammy-py3_9-clang9-xla - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-clang9-xla - docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite - test-matrix: | - { include: [ - { config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" }, - ]} - secrets: inherit - - linux-jammy-py3_9-clang9-xla-test: - name: linux-jammy-py3_9-clang9-xla - uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-py3_9-clang9-xla-build - with: - build-environment: linux-jammy-py3.9-clang9-xla - docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }} - secrets: inherit diff --git a/.github/workflows/vllm.yml b/.github/workflows/vllm.yml index 654e88be386b6..b2768a8f767e2 100644 --- a/.github/workflows/vllm.yml +++ b/.github/workflows/vllm.yml @@ -36,6 +36,8 @@ jobs: uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: + # When building vLLM, uv doesn't like that we rename wheel without changing the wheel metadata + allow-reuse-old-whl: false build-additional-packages: "vision audio" build-external-packages: "vllm" build-environment: linux-jammy-cuda12.8-py3.12-gcc11 diff --git a/.gitignore b/.gitignore index d1fa4cd3caf28..ca87f1306e125 100644 --- a/.gitignore +++ b/.gitignore @@ -259,6 +259,9 @@ gen .pytest_cache aten/build/* +# Linker scripts for prioritized text optimization +cmake/linker_script.ld + # Bram plsdontbreak @@ -389,3 +392,5 @@ android/pytorch_android_torchvision/.cxx # Claude Code local configuration CLAUDE.local.md +/test_*.py +/debug_*.py diff --git a/.lintrunner.toml b/.lintrunner.toml index 944829fa38977..679a04981b07a 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -13,7 +13,7 @@ exclude_patterns = [ '**/fb/**', 'functorch/docs/**', 'functorch/examples/**', - 'functorch/notebooks/**', + 'functorch/docs/source/tutorials/**', 'torch/_inductor/fx_passes/serialized_patterns/**', 'torch/_inductor/autoheuristic/artifacts/**', 'scripts/**', @@ -49,7 +49,7 @@ init_command = [ 'mccabe==0.7.0', 'pycodestyle==2.14.0', 'pyflakes==3.4.0', - 'torchfix==0.4.0 ; python_version >= "3.9" and python_version < "3.13"', + 'torchfix==0.4.0 ; 
python_version >= "3.10" and python_version < "3.13"', ] @@ -123,6 +123,7 @@ is_formatter = true code = 'MYPY' include_patterns = [ 'setup.py', + 'functorch/dim/**/*.py', 'torch/**/*.py', 'torch/**/*.pyi', 'caffe2/**/*.py', @@ -152,7 +153,7 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', - 'numpy==1.26.4 ; python_version >= "3.9" and python_version <= "3.11"', + 'numpy==1.26.4 ; python_version >= "3.10" and python_version <= "3.11"', 'numpy==2.1.0 ; python_version >= "3.12"', 'expecttest==0.3.0', 'mypy==1.16.0', @@ -195,6 +196,7 @@ exclude_patterns = [ 'tools/test/gen_operators_yaml_test.py', 'tools/test/gen_oplist_test.py', 'tools/test/test_selective_build.py', + 'tools/experimental/dynamic_shapes/torchfuzz/**', ] command = [ 'python3', @@ -964,7 +966,6 @@ exclude_patterns = [ 'test/jit/**', # should be run through test/test_jit.py 'test/ao/sparsity/**', # should be run through test/test_ao_sparsity.py 'test/fx/**', # should be run through test/test_fx.py - 'test/bottleneck_test/**', # excluded by test/run_test.py 'test/package/**', # excluded by test/run_test.py 'test/distributed/argparse_util_test.py', 'test/distributed/bin/test_script.py', @@ -1410,8 +1411,6 @@ exclude_patterns = [ 'torch/utils/benchmark/utils/timer.py', 'torch/utils/benchmark/utils/valgrind_wrapper/__init__.py', 'torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py', - 'torch/utils/bottleneck/__init__.py', - 'torch/utils/bottleneck/__main__.py', 'torch/utils/bundled_inputs.py', 'torch/utils/checkpoint.py', 'torch/utils/collect_env.py', @@ -1568,7 +1567,6 @@ include_patterns = [ exclude_patterns = [ 'caffe2/**', 'functorch/docs/**', - 'functorch/notebooks/**', 'torch/_inductor/fx_passes/serialized_patterns/**', 'torch/_inductor/autoheuristic/artifacts/**', 'test/dynamo/cpython/**', diff --git a/BUILD.bazel b/BUILD.bazel index 2cbd36f06761b..5d7625b402947 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -810,7 +810,7 @@ cc_library( name = "torch_python", srcs = libtorch_python_core_sources + if_cuda(libtorch_python_cuda_sources) - + if_cuda(libtorch_python_distributed_sources) + + libtorch_python_distributed_sources + GENERATED_AUTOGRAD_PYTHON, hdrs = glob([ "torch/csrc/generic/*.cpp", @@ -832,36 +832,6 @@ pybind_extension( ], ) -cc_library( - name = "functorch", - hdrs = glob([ - "functorch/csrc/dim/*.h", - ]), - srcs = glob([ - "functorch/csrc/dim/*.cpp", - ]), - deps = [ - ":aten_nvrtc", - ":torch_python", - "@pybind11", - ], -) - -pybind_extension( - name = "functorch/_C", - copts=[ - "-DTORCH_EXTENSION_NAME=_C" - ], - srcs = [ - "functorch/csrc/init_dim_only.cpp", - ], - deps = [ - ":functorch", - ":torch_python", - ":aten_nvrtc", - ], -) - cc_binary( name = "torch/bin/torch_shm_manager", srcs = [ @@ -902,7 +872,6 @@ py_library( ], data = [ ":torch/_C.so", - ":functorch/_C.so", ":torch/bin/torch_shm_manager", ], ) diff --git a/CMakeLists.txt b/CMakeLists.txt index 21c867dd6b6e6..eb973f33fb8f9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,4 @@ cmake_minimum_required(VERSION 3.27 FATAL_ERROR) -# cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0023 NEW) # Use compiler ID "AppleClang" instead of "Clang" for XCode. Not setting this # sometimes makes XCode C compiler gets detected as "Clang", even when the C++ @@ -380,6 +379,13 @@ cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler" OFF "USE_CUDA" OFF) cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." 
ON "CPU_AARCH64" OFF) +# prioritized text linker, ON by default for AArch64+Linux, option visible to all AArch64, x86 and ppc64le. +set(USE_PRIORITIZED_TEXT_DEFAULT OFF) +if(LINUX AND CPU_AARCH64) + set(USE_PRIORITIZED_TEXT_DEFAULT ON) +endif() +cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker for ld." + "${USE_PRIORITIZED_TEXT_DEFAULT}" "CPU_INTEL OR CPU_AARCH64 OR CPU_POWER" OFF) option(USE_MIMALLOC "Use mimalloc" OFF) # Enable third party mimalloc library to improve memory allocation performance @@ -657,6 +663,11 @@ endif(MSVC) string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all") +# Set linker max-page-size to 64KiB on AArch64 Linux +if(LINUX AND CPU_AARCH64) + add_link_options_if_supported("-z,max-page-size=0x10000") +endif() + # Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not # applicable to mobile are disabled by this variable. Setting # `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable can force it @@ -874,7 +885,7 @@ cmake_dependent_option( "Whether to build the flash_attention kernel for scaled dot product attention.\ Will be disabled if not supported by the platform" ON - "USE_CUDA OR USE_ROCM;NOT MSVC" + "(USE_CUDA AND NOT MSVC) OR USE_ROCM" OFF) cmake_dependent_option( @@ -885,6 +896,17 @@ cmake_dependent_option( "USE_CUDA OR USE_ROCM" OFF) +IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH) + message(WARNING "Unsupported ROCM arch for FBGEMM GenAI, will set USE_FBGEMM_GENAI to OFF") + set(USE_FBGEMM_GENAI off) +endif() + +# Set USE_FBGEMM_GENAI to ON for CUDA build on SM100. +if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8 AND NOT WIN32) + message(STATUS "Setting USE_FBGEMM_GENAI to ON, doing CUDA build for SM100a") + set(USE_FBGEMM_GENAI ON) +endif() + # CAVEAT: Again, Flash Attention2 will error while building for sm52 while Mem # Eff Attention won't cmake_dependent_option( @@ -898,7 +920,7 @@ cmake_dependent_option( # USE_FLASH_ATTENTION -> USE_ROCM -> Dependencies.cmake -> aotriton.cmake # if(USE_ROCM) - if(UNIX AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION)) + if(USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION) include(cmake/External/aotriton.cmake) endif() endif() @@ -1368,10 +1390,6 @@ endif() include(cmake/Summary.cmake) caffe2_print_configuration_summary() -if(BUILD_FUNCTORCH) - add_subdirectory(functorch) -endif() - # Parse custom debug info if(DEFINED USE_CUSTOM_DEBINFO) string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}") @@ -1410,3 +1428,57 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA) install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas" DESTINATION "${CMAKE_INSTALL_BINDIR}") endif() + +if(USE_PRIORITIZED_TEXT_FOR_LD) + add_compile_options( + $<$:-ffunction-sections> + $<$:-fdata-sections> + ) + set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld") + set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt") + + add_custom_command( + OUTPUT "${LINKER_SCRIPT_FILE_OUT}" + COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}" + DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}" + COMMENT "Generating prioritized text linker files" + VERBATIM + ) + + add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}") + + if(BUILD_PYTHON) + set(LINKER_OPT_TARGETS torch_python) + endif() + + if(NOT 
BUILD_LIBTORCHLESS) + list(APPEND LINKER_OPT_TARGETS torch_cpu c10) + if(USE_CUDA) + list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda) + endif() + if(USE_XPU) + list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu) + endif() + if(USE_ROCM) + list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip) + endif() + endif() + + foreach(tgt IN LISTS LINKER_OPT_TARGETS) + if(TARGET ${tgt}) + add_dependencies("${tgt}" generate_linker_script) + target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}") + set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}") + else() + message(WARNING "Requested target '${tgt}' for linker script optimization was not found.") + endif() + endforeach() + +else() + if(LINUX AND CPU_AARCH64) + message(WARNING [[ + It is strongly recommended to enable linker script optimization for all AArch64 Linux builds. + To do so, please export USE_PRIORITIZED_TEXT_FOR_LD=1 + ]]) + endif() +endif() diff --git a/README.md b/README.md index 99e6dabd16181..4356491e178e7 100644 --- a/README.md +++ b/README.md @@ -161,7 +161,7 @@ They require JetPack 4.2 and above, and [@dusty-nv](https://github.com/dusty-nv) #### Prerequisites If you are installing from source, you will need: -- Python 3.9 or later +- Python 3.10 or later - A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required, on Linux) - Visual Studio or Visual Studio Build Tool (Windows only) diff --git a/RELEASE.md b/RELEASE.md index 047bb10161f71..52371e73f0a6d 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -50,6 +50,7 @@ Following is the Release Compatibility Matrix for PyTorch releases: | PyTorch version | Python | C++ | Stable CUDA | Experimental CUDA | Stable ROCm | | --- | --- | --- | --- | --- | --- | +| 2.9 | >=3.10, <=3.14, (3.14t experimental) | C++17 | CUDA 12.6 (CUDNN 9.10.2.21), CUDA 12.8 (CUDNN 9.10.2.21) | CUDA 13.0 (CUDNN 9.13.0.50) | ROCm 6.4 | | 2.8 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 12.6 (CUDNN 9.10.2.21), CUDA 12.8 (CUDNN 9.10.2.21) | CUDA 12.9 (CUDNN 9.10.2.21) | ROCm 6.4 | | 2.7 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 11.8 (CUDNN 9.1.0.70), CUDA 12.6 (CUDNN 9.5.1.17) | CUDA 12.8 (CUDNN 9.7.1.26) | ROCm 6.3 | | 2.6 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 11.8, CUDA 12.4 (CUDNN 9.1.0.70) | CUDA 12.6 (CUDNN 9.5.1.17) | ROCm 6.2.4 | diff --git a/SECURITY.md b/SECURITY.md index 3baa145df7953..16d72ef1ea08e 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -16,6 +16,8 @@ However, if you believe you have found a security vulnerability in PyTorch, we e Please report security issues using https://github.com/pytorch/pytorch/security/advisories/new +All reports submitted through the security advisories mechanism will **either be made public or dismissed by the team within 90 days of the submission**. If an advisory has been closed on the grounds that it is not a security issue, please do not hesitate to create a [new issue](https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml) as it is still likely a valid issue within the framework.
+ Please refer to the following page for our responsible disclosure policy, reward guidelines, and those things that should not be reported: https://www.facebook.com/whitehat diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index a3c98f37a0242..b9f8995082ccf 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -270,6 +270,14 @@ IF(USE_FBGEMM_GENAI) "${FBGEMM_GENAI_SRCS}/cutlass_extensions/**/*.cu") list(FILTER fbgemm_genai_native_cuda_cu INCLUDE REGEX ${FBGEMM_CUTLASS_KERNELS_REGEX}) + # PyTorch is not built for 10.0a in CI, due to lack of portability, + # so we need to explicitly build these files for 10.0a. + foreach(cu_file ${fbgemm_genai_native_cuda_cu}) + _BUILD_FOR_ADDITIONAL_ARCHS( + "${cu_file}" + "100a") + endforeach() + file(GLOB_RECURSE fbgemm_genai_native_cuda_cpp "${FBGEMM_GENAI_SRCS}/common/*.cpp" ) @@ -315,10 +323,20 @@ IF(USE_FBGEMM_GENAI) -greedy-reverse-local-assignment=1 -fhip-new-launch-api) + # Only compile for gfx942 for now. + # This is rather hacky, I could not figure out a clean solution :( + set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS}) + string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}") + if("gfx942" IN_LIST PYTORCH_ROCM_ARCH) + list(APPEND FILTERED_HIP_CLANG_FLAGS --offload-arch=gfx942;) + endif() + set(HIP_CLANG_FLAGS ${FILTERED_HIP_CLANG_FLAGS}) + hip_add_library( fbgemm_genai STATIC ${fbgemm_genai_native_rocm_hip} HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS}) + set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS_ORIGINAL}) set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON) target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES) diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 4d48084b0ab89..7a8d02be530e3 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -180,7 +180,7 @@ void Context::setUserEnabledNNPACK(bool e) { } bool Context::allowTF32CuDNN(const std::string& op) const { - if (op.size() == 0){ + if (op.empty()){ bool allow_tf32_rnn = float32Precision("cuda", "rnn") == "tf32"; bool allow_tf32_conv = float32Precision("cuda", "conv") == "tf32"; TORCH_CHECK( @@ -281,9 +281,6 @@ bool Context::userEnabledOverrideableSDP() const { static constexpr const auto cublas_config_var_name = "CUBLAS_WORKSPACE_CONFIG"; static constexpr const std::array cublas_deterministic_configs = {":4096:8", ":16:8"}; -#ifdef USE_ROCM -static constexpr const auto hipblaslt_allow_tf32 = "HIPBLASLT_ALLOW_TF32"; -#endif bool Context::checkCuBLASConfigDeterministic() { // If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config @@ -343,12 +340,6 @@ void Context::setImmediateMiopen(bool b) { } bool Context::allowTF32CuBLAS() const { -#ifdef USE_ROCM - const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32); - if (allow_tf32 != true) { - return false; - } -#endif bool legacy_allow_tf32 = float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST; bool allow_tf32_new = float32Precision("cuda", "matmul") == "tf32"; TORCH_CHECK( @@ -362,14 +353,6 @@ bool Context::allowTF32CuBLAS() const { } void Context::setAllowTF32CuBLAS(bool b) { -#ifdef USE_ROCM - const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32); - if (allow_tf32 != true) { - C10_LOG_FIRST_N(INFO, 10) << "torch.backends.cuda.matmul.allow_tf32 is not supported on ROCm by default. 
" - << "Please set environment variable HIPBLASLT_ALLOW_TF32=1 to enable it."; - return; - } -#endif float32_matmul_precision = b ? at::Float32MatmulPrecision::HIGH : at::Float32MatmulPrecision::HIGHEST; setFloat32Precision("cuda", "matmul", b ? "tf32" : "ieee"); } @@ -443,7 +426,7 @@ void Context::setFloat32Precision(const std::string& backend, const std::string& std::string msg; auto iterp = _fp32_precisions.find(backend); TORCH_CHECK(iterp != _fp32_precisions.end()); - for (auto p : iterp->second) { + for (const auto& p : iterp->second) { msg += p; msg += " "; } diff --git a/aten/src/ATen/DLConvertor.cpp b/aten/src/ATen/DLConvertor.cpp index 98ad757946bec..7c2ad5c609e7b 100644 --- a/aten/src/ATen/DLConvertor.cpp +++ b/aten/src/ATen/DLConvertor.cpp @@ -65,14 +65,24 @@ DLDataType getDLDataType(const Tensor& t) { break; // TODO(#146647): use macro here instead of spelling out each shell dtype case ScalarType::Float8_e5m2: + dtype.code = DLDataTypeCode::kDLFloat8_e5m2; + break; case ScalarType::Float8_e5m2fnuz: + dtype.code = DLDataTypeCode::kDLFloat8_e5m2fnuz; + break; case ScalarType::Float8_e4m3fn: + dtype.code = DLDataTypeCode::kDLFloat8_e4m3fn; + break; case ScalarType::Float8_e4m3fnuz: + dtype.code = DLDataTypeCode::kDLFloat8_e4m3fnuz; + break; case ScalarType::Float8_e8m0fnu: - TORCH_CHECK_BUFFER(false, "float8 types are not supported by dlpack"); + dtype.code = DLDataTypeCode::kDLFloat8_e8m0fnu; break; case ScalarType::Float4_e2m1fn_x2: - TORCH_CHECK_BUFFER(false, "float4 types are not supported by dlpack"); + dtype.code = DLDataTypeCode::kDLFloat4_e2m1fn; + dtype.lanes = 2; + dtype.bits = 4; break; case ScalarType::QInt8: case ScalarType::QUInt8: @@ -177,7 +187,11 @@ static Device getATenDevice(DLDeviceType type, c10::DeviceIndex index, void* dat ScalarType toScalarType(const DLDataType& dtype) { ScalarType stype = ScalarType::Undefined; - TORCH_CHECK_BUFFER(dtype.lanes == 1, "ATen does not support lanes != 1"); + if (dtype.code != DLDataTypeCode::kDLFloat4_e2m1fn) { + TORCH_CHECK_BUFFER( + dtype.lanes == 1, + "ATen does not support lanes != 1 for dtype code", std::to_string(dtype.code)); + } switch (dtype.code) { case DLDataTypeCode::kDLUInt: switch (dtype.bits) { @@ -269,6 +283,73 @@ ScalarType toScalarType(const DLDataType& dtype) { false, "Unsupported kDLBool bits ", std::to_string(dtype.bits)); } break; + case DLDataTypeCode::kDLFloat8_e5m2: + switch (dtype.bits) { + case 8: + stype = ScalarType::Float8_e5m2; + break; + default: + TORCH_CHECK_BUFFER( + false, "Unsupported kDLFloat8_e5m2 bits ", std::to_string(dtype.bits)); + } + break; + case DLDataTypeCode::kDLFloat8_e5m2fnuz: + switch (dtype.bits) { + case 8: + stype = ScalarType::Float8_e5m2fnuz; + break; + default: + TORCH_CHECK_BUFFER( + false, "Unsupported kDLFloat8_e5m2fnuz bits ", std::to_string(dtype.bits)); + } + break; + case DLDataTypeCode::kDLFloat8_e4m3fn: + switch (dtype.bits) { + case 8: + stype = ScalarType::Float8_e4m3fn; + break; + default: + TORCH_CHECK_BUFFER( + false, "Unsupported kDLFloat8_e4m3fn bits ", std::to_string(dtype.bits)); + } + break; + case DLDataTypeCode::kDLFloat8_e4m3fnuz: + switch (dtype.bits) { + case 8: + stype = ScalarType::Float8_e4m3fnuz; + break; + default: + TORCH_CHECK_BUFFER( + false, "Unsupported kDLFloat8_e4m3fnuz bits ", std::to_string(dtype.bits)); + } + break; + case DLDataTypeCode::kDLFloat8_e8m0fnu: + switch (dtype.bits) { + case 8: + stype = ScalarType::Float8_e8m0fnu; + break; + default: + TORCH_CHECK_BUFFER( + false, "Unsupported kDLFloat8_e8m0fnu bits ", 
std::to_string(dtype.bits)); + } + break; + case DLDataTypeCode::kDLFloat4_e2m1fn: + switch (dtype.bits) { + case 4: + switch (dtype.lanes) { + case 2: + stype = ScalarType::Float4_e2m1fn_x2; + break; + default: + TORCH_CHECK_BUFFER( + false, "Unsupported kDLFloat4_e2m1fn lanes ", std::to_string(dtype.lanes)); + } + break; + default: + TORCH_CHECK_BUFFER( + false, "Unsupported kDLFloat4_e2m1fn bits ", std::to_string(dtype.bits)); + } + break; default: TORCH_CHECK_BUFFER(false, "Unsupported code ", std::to_string(dtype.code)); } @@ -320,30 +401,13 @@ T* toDLPackImpl(const Tensor& src) { // The following code detects whether the src follows // a continuous pattern. If the src follows such pattern (common-case) // then we do not need to normalize the strides. - bool need_normalize_strides = false; - int64_t expected_stride = 1; - for (int i = src.dim() - 1; i >= 0; i--) { - // detect if we do not meet continuous pattern - // and the size is 1, so there is opportunity to normalize - if (src.stride(i) != expected_stride && src.size(i) == 1) { - need_normalize_strides = true; - break; - } - expected_stride *= src.size(i); - } - + bool need_normalize_strides = src.dim() == 1 && src.size(0) == 1 && src.stride(0) != 1; // less common case, try normalizing the strides if (need_normalize_strides) { // create a new tensor with possibly normalized strides // gh-83069 auto shape = src.sizes(); - auto strides = src.strides().vec(); - for (int i = 0; i < src.dim(); i++) { - if (shape[i] < 2) { - strides[i] = 1; - } - } - view = src.as_strided(shape, strides, src.storage_offset()); + view = src.as_strided(shape, {1}, src.storage_offset()); } ATenDLMTensor* atDLMTensor(new ATenDLMTensor); @@ -354,8 +418,8 @@ T* toDLPackImpl(const Tensor& src) { atDLMTensor->tensor.dl_tensor.device = torchDeviceToDLDevice(src.device()); atDLMTensor->tensor.dl_tensor.ndim = static_cast(src.dim()); atDLMTensor->tensor.dl_tensor.dtype = getDLDataType(src); - atDLMTensor->tensor.dl_tensor.shape = view.sizes().data(); - atDLMTensor->tensor.dl_tensor.strides = view.strides().data(); + atDLMTensor->tensor.dl_tensor.shape = const_cast(view.sizes().data()); + atDLMTensor->tensor.dl_tensor.strides = const_cast(view.strides().data()); atDLMTensor->tensor.dl_tensor.byte_offset = 0; fillVersion(&atDLMTensor->tensor); diff --git a/aten/src/ATen/FunctionalStorageImpl.cpp b/aten/src/ATen/FunctionalStorageImpl.cpp index a5512818343fb..2cf8d9727f658 100644 --- a/aten/src/ATen/FunctionalStorageImpl.cpp +++ b/aten/src/ATen/FunctionalStorageImpl.cpp @@ -102,7 +102,7 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base) // SparseTensorImpl has no storage, so we cannot query its nbytes. 
// (original_storage_size is only used for storage resizing in fsdp anyway, which does not apply to sparse) // Same for XLA - if (base.unsafeGetTensorImpl()->has_storage() && base.device().type() != c10::DeviceType::XLA) { + if (base.unsafeGetTensorImpl()->has_storage() && data_ptr().device().type() != c10::DeviceType::XLA) { original_storage_size_ = base.unsafeGetTensorImpl()->unsafe_storage().unsafeGetStorageImpl()->sym_nbytes(); } else { original_storage_size_ = -1; diff --git a/aten/src/ATen/FunctionalTensorWrapper.cpp b/aten/src/ATen/FunctionalTensorWrapper.cpp index 7d5e4e84e861d..0a2fa153a6cf1 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.cpp +++ b/aten/src/ATen/FunctionalTensorWrapper.cpp @@ -133,7 +133,7 @@ FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const : c10::TensorImpl( c10::DispatchKeySet(DispatchKey::Functionalize), view_value.dtype(), - view_value.device() + base->storage().data_ptr().device() ), value_(view_value), is_multi_output_view_(base->is_multi_output_view_ || meta.is_multi_output), @@ -485,7 +485,10 @@ void FunctionalTensorWrapper::shallow_copy_from(const c10::intrusive_ptrdevice(); + // The storage pointer already uses the underlying tensor custom device (if + // applicable) to extract the device. So, we dont have to recurse again by + // doing value_.unsafeGetTensorImpl()->device(). + return storage().data_ptr().device(); } at::IntArrayRef FunctionalTensorWrapper::sizes_custom() const { return value_.unsafeGetTensorImpl()->sizes(); diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index 39f77664de864..b10795fbc37eb 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -133,12 +133,12 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { "resize_ called on tensor with symbolic shape") TORCH_CHECK( sparse_dim + dense_dim == static_cast(size.size()), - "number of dimensions must be sparse_dim (", + "'len(size) == sparse_dim + dense_dim' is not satisfied: len(size) = ", + size.size(), + ", sparse_dim = ", sparse_dim, - ") + dense_dim (", - dense_dim, - "), but got ", - size.size()); + ", dense_dim = ", + dense_dim); if (nnz() > 0) { [[maybe_unused]] auto constexpr alt_options_msg = "You could try the following options:\n\ @@ -254,12 +254,12 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { "resize_and_clear_ called on tensor with symbolic shape") TORCH_CHECK( sparse_dim + dense_dim == static_cast(size.size()), - "number of dimensions must be sparse_dim (", + "'len(size) == sparse_dim + dense_dim' is not satisfied: len(size) = ", + size.size(), + ", sparse_dim = ", sparse_dim, - ") + dense_dim (", - dense_dim, - "), but got ", - size.size()); + ", dense_dim = ", + dense_dim); set_sizes_and_strides(size, std::vector(size.size())); sparse_dim_ = sparse_dim; diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index 0d319ea593840..07d5ae5d9886e 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -644,6 +644,8 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP void * beta_ptr = &fbeta; #ifdef USE_ROCM int flag = 0; + rocblas_datatype c_type = std::is_same::value ? rocblas_datatype_f32_r : rocblas_datatype_f16_r; + rocblas_datatype d_type = c_type; #if USE_GEMM_FLAGS_FP16_ALT_IMPL flag = at::ROCmBackwardPassGuard::is_backward_pass() ? 
rocblas_gemm_flags_fp16_alt_impl : 0; #endif @@ -652,8 +654,8 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP hipOperationToRocOperation(opb), (int)m, (int)n, (int)k, (void*)alpha_ptr, a, rocblas_datatype_f16_r, (int)lda, stridea, b, rocblas_datatype_f16_r, (int)ldb, strideb, - (void*)beta_ptr, c, rocblas_datatype_f16_r, (int)ldc, stridec, - c, rocblas_datatype_f16_r, (int)ldc, stridec, + (void*)beta_ptr, c, c_type, (int)ldc, stridec, + c, d_type, (int)ldc, stridec, (int) num_batches, rocblas_datatype_f32_r, rocblas_gemm_algo_standard, 0, flag))); #else @@ -1096,6 +1098,8 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE( GEMM_CHECK_ARGVALUES(at::Half); #ifdef USE_ROCM int flag = 0; + rocblas_datatype c_type = std::is_same::value ? rocblas_datatype_f32_r : rocblas_datatype_f16_r; + rocblas_datatype d_type = c_type; #if USE_GEMM_FLAGS_FP16_ALT_IMPL flag = at::ROCmBackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0; #endif @@ -1115,10 +1119,10 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE( ldb, beta_ptr, c, - rocblas_datatype_f16_r, + c_type, ldc, c, - rocblas_datatype_f16_r, + d_type, ldc, rocblas_datatype_f32_r, rocblas_gemm_algo_standard, @@ -1633,9 +1637,7 @@ bool gemm_and_bias( if (activation == GEMMAndBiasActivationEpilogue::RELU) { epilogue = CUBLASLT_EPILOGUE_RELU_BIAS; } else if (activation == GEMMAndBiasActivationEpilogue::GELU) { -#if CUDA_VERSION >= 11040 || defined(USE_ROCM) epilogue = CUBLASLT_EPILOGUE_GELU_BIAS; -#endif } if (bias != nullptr) { @@ -1927,7 +1929,6 @@ void scaled_gemm( bool use_fast_accum) { // Note: see `cublasCommonArgs` for various non-intuitive manupulations // of input arguments to this function. -#if CUDA_VERSION >= 11080 || defined(USE_ROCM) const auto computeType = CUBLAS_COMPUTE_32F; const auto scaleType = CUDA_R_32F; const float alpha_val = 1.0; @@ -1950,8 +1951,8 @@ void scaled_gemm( #if ROCM_VERSION >= 70000 if (at::detail::getCUDAHooks().isGPUArch({"gfx950"})) { // TODO: add constraints based on hipblaslt internals - TORCH_CHECK((m % 32 == 0) && (n % 32 == 0) && (k % 32 == 0), - "Matrix dimensions must be multiples of 32 for MX format. " + TORCH_CHECK((m % 16 == 0) && (n % 16 == 0) && (k % 128 == 0), + "M, N must be multiples of 16 and K should be multiple of 128 for MX format. 
" "Got m=", m, ", n=", n, ", k=", k); } #endif @@ -2129,8 +2130,6 @@ void scaled_gemm( " scaleType ", scaleType); return; -#endif // if CUDA_VERSION >= 11080 || defined(USE_ROCM) - TORCH_CHECK(false, "scaled_gemm is only supported for CUDA 11.8 and above"); } void int8_gemm( diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp index 422890084c900..f95faa94e6113 100644 --- a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp +++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp @@ -266,11 +266,14 @@ CUDAGeneratorImpl::CUDAGeneratorImpl( * See Note [Acquire lock when using random generators] */ void CUDAGeneratorImpl::set_current_seed(uint64_t seed) { - at::cuda::assertNotCapturing( - "Cannot call CUDAGeneratorImpl::set_current_seed"); - state_->seed_ = seed; - state_->philox_offset_per_thread_ = 0; - no_reset_rnn_state_.clear(); + if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) { + state_->seed_ = seed; + state_->philox_offset_per_thread_ = 0; + no_reset_rnn_state_.clear(); + } else { + TORCH_CHECK(state_->seed_ == seed, "CUDAGeneratorImpl::set_current_seed can be called during stream capture only if new seed is the same as the original seed."); + // no-op case + } } /** @@ -299,9 +302,6 @@ uint64_t CUDAGeneratorImpl::get_offset() const { * Gets the current seed of CUDAGeneratorImpl. */ uint64_t CUDAGeneratorImpl::current_seed() const { - // Debatable if current_seed() should be allowed in captured regions. - // Conservatively disallow it for now. - at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::current_seed"); return state_->seed_; } @@ -346,8 +346,6 @@ c10::intrusive_ptr CUDAGeneratorImpl::get_state() const { * and size of the internal state. */ void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) { - at::cuda::assertNotCapturing( - "Please ensure to utilize the CUDAGeneratorImpl::set_state_index method during capturing."); static const size_t seed_size = sizeof(uint64_t); static const size_t offset_size = sizeof(int64_t); static const size_t total_size = seed_size + offset_size; @@ -402,15 +400,27 @@ c10::intrusive_ptr CUDAGeneratorImpl::graphsafe_get_state() */ void CUDAGeneratorImpl::set_philox_offset_per_thread(uint64_t offset) { // see Note [Why enforce RNG offset % 4 == 0?] + + // Note: If you use CUDNN RNN's, calling + // set_philox_offset_per_thread instead of set_offset will cause the + // cudnn RNN rng state to become stale. TORCH_CHECK(offset % 4 == 0, "offset must be a multiple of 4"); - state_->philox_offset_per_thread_ = offset; + if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) { + state_->philox_offset_per_thread_ = offset; + } else { + state_->offset_intragraph_ = offset; + } } /** * Gets the current philox_offset_per_thread_ of CUDAGeneratorImpl. */ uint64_t CUDAGeneratorImpl::philox_offset_per_thread() const { - return state_->philox_offset_per_thread_; + if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) { + return state_->philox_offset_per_thread_; + } else { + return state_->offset_intragraph_; + } } /** diff --git a/aten/src/ATen/dlpack.h b/aten/src/ATen/dlpack.h index 82c0668211188..f1b3ae2b7760b 100644 --- a/aten/src/ATen/dlpack.h +++ b/aten/src/ATen/dlpack.h @@ -19,7 +19,7 @@ #define DLPACK_MAJOR_VERSION 1 /*! \brief The current minor version of dlpack */ -#define DLPACK_MINOR_VERSION 0 +#define DLPACK_MINOR_VERSION 1 /*! 
\brief DLPACK_DLL prefix for windows */ #ifdef _WIN32 @@ -32,9 +32,7 @@ #define DLPACK_DLL #endif -// NOLINTNEXTLINE(modernize-deprecated-headers) #include -// NOLINTNEXTLINE(modernize-deprecated-headers) #include #ifdef __cplusplus @@ -159,6 +157,26 @@ typedef enum { kDLComplex = 5U, /*! \brief boolean */ kDLBool = 6U, + /*! \brief FP8 data types */ + kDLFloat8_e3m4 = 7U, + kDLFloat8_e4m3 = 8U, + kDLFloat8_e4m3b11fnuz = 9U, + kDLFloat8_e4m3fn = 10U, + kDLFloat8_e4m3fnuz = 11U, + kDLFloat8_e5m2 = 12U, + kDLFloat8_e5m2fnuz = 13U, + kDLFloat8_e8m0fnu = 14U, + /*! \brief FP6 data types + * Setting bits != 6 is currently unspecified, and the producer must ensure it is set + * while the consumer must stop importing if the value is unexpected. + */ + kDLFloat6_e2m3fn = 15U, + kDLFloat6_e3m2fn = 16U, + /*! \brief FP4 data types + * Setting bits != 4 is currently unspecified, and the producer must ensure it is set + * while the consumer must stop importing if the value is unexpected. + */ + kDLFloat4_e2m1fn = 17U, } DLDataTypeCode; /*! @@ -172,6 +190,12 @@ typedef enum { * - int8: type_code = 0, bits = 8, lanes = 1 * - std::complex: type_code = 5, bits = 64, lanes = 1 * - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits) + * - float8_e4m3: type_code = 8, bits = 8, lanes = 1 (packed in memory) + * - float6_e3m2fn: type_code = 16, bits = 6, lanes = 1 (packed in memory) + * - float4_e2m1fn: type_code = 17, bits = 4, lanes = 1 (packed in memory) + * + * When a sub-byte type is packed, DLPack requires the data to be in little bit-endian, i.e., + * for a packed data set D ((D >> (i * bits)) && bit_mask) stores the i-th element. */ typedef struct { /*! @@ -229,12 +253,12 @@ typedef struct { /*! \brief The data type of the pointer*/ DLDataType dtype; /*! \brief The shape of the tensor */ - const int64_t* shape; + int64_t* shape; /*! * \brief strides of the tensor (in number of elements, not bytes) * can be NULL, indicating tensor is compact and row-majored. */ - const int64_t* strides; + int64_t* strides; /*! \brief The offset in bytes to the beginning pointer to data */ uint64_t byte_offset; } DLTensor; @@ -269,7 +293,7 @@ typedef struct DLManagedTensor { void (*deleter)(struct DLManagedTensor * self); } DLManagedTensor; -// bit masks used in in the DLManagedTensorVersioned +// bit masks used in the DLManagedTensorVersioned /*! \brief bit mask to indicate that the tensor is read only. */ #define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL) @@ -282,6 +306,14 @@ typedef struct DLManagedTensor { */ #define DLPACK_FLAG_BITMASK_IS_COPIED (1UL << 1UL) +/* + * \brief bit mask to indicate that whether a sub-byte type is packed or padded. + * + * The default for sub-byte types (ex: fp4/fp6) is assumed packed. This flag can + * be set by the producer to signal that a tensor of sub-byte type is padded. + */ +#define DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED (1UL << 2UL) + /*! * \brief A versioned and managed C Tensor object, manage memory of DLTensor. 
* diff --git a/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp b/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp index b26d2c4a419e5..48a735c3e5332 100644 --- a/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp @@ -171,6 +171,8 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { POINTWISE_BOXED(fill_.Scalar); POINTWISE_BOXED(zero_); + // This is special because this op doesn't return anything + m.impl("_assert_tensor_metadata", native::_assert_tensor_metadata); #undef UNARY_POINTWISE #undef UNARY_POINTWISE_ALL diff --git a/aten/src/ATen/native/CPUBlas.cpp b/aten/src/ATen/native/CPUBlas.cpp index e06afddd05aa7..20be0d6fe017a 100644 --- a/aten/src/ATen/native/CPUBlas.cpp +++ b/aten/src/ATen/native/CPUBlas.cpp @@ -457,24 +457,9 @@ void gemm( return; } #endif - // for the fallback path, first compute gemm with beta = 0, - // and then add c in full precision. - int64_t c_size = n * m; - std::vector float_c(c_size, 0.f); gemm_no_downcast_stub( at::kCPU, at::kBFloat16, - transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m); - for (const auto j : c10::irange(n)) { - for (const auto i : c10::irange(m)) { - auto offset = j * ldc + i; - // beta == 0 won't propagate NaN from C - if (beta == 0.f) { - c[offset] = float_c[j * m + i]; - } else { - c[offset] = beta * c[offset] + float_c[j * m + i]; - } - } - } + transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm( @@ -493,24 +478,9 @@ void gemm( return; } #endif - // for the fallback path, first compute gemm with beta = 0, - // and then add c in full precision. - int64_t c_size = n * m; - std::vector float_c(c_size, 0.f); gemm_no_downcast_stub( at::kCPU, at::kHalf, - transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m); - for (const auto j : c10::irange(n)) { - for (const auto i : c10::irange(m)) { - auto offset = j * ldc + i; - // beta == 0 won't propagate NaN from C - if (beta == 0.f) { - c[offset] = float_c[j * m + i]; - } else { - c[offset] = beta * c[offset] + float_c[j * m + i]; - } - } - } + transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm( diff --git a/aten/src/ATen/native/ChanelShuffle.cpp b/aten/src/ATen/native/ChanelShuffle.cpp index 64fdd56c0e665..d043014b3820e 100644 --- a/aten/src/ATen/native/ChanelShuffle.cpp +++ b/aten/src/ATen/native/ChanelShuffle.cpp @@ -81,7 +81,7 @@ Tensor math_channel_shuffle(const Tensor& self, int64_t groups) { // TODO: contiguous can be made to preserve the memory format // of the input. However since the above reshape clobbers h and w // it may not be safe to do that, since channels_last contiguous - // may think oc and and the last dim correspond to h,w? + // may think oc and the last dim correspond to h,w? // It is not clear, however from initial looking around it feels that // this may not be correct. 
// In this case channels last will likely require custom implementation diff --git a/aten/src/ATen/native/EmbeddingBag.h b/aten/src/ATen/native/EmbeddingBag.h index eb29e1171dcd6..a344422204844 100644 --- a/aten/src/ATen/native/EmbeddingBag.h +++ b/aten/src/ATen/native/EmbeddingBag.h @@ -1,3 +1,4 @@ +#pragma once #include #include #include diff --git a/aten/src/ATen/native/Fill.cpp b/aten/src/ATen/native/Fill.cpp index 5ff1e6b61ed20..8e04a7490e879 100644 --- a/aten/src/ATen/native/Fill.cpp +++ b/aten/src/ATen/native/Fill.cpp @@ -97,43 +97,38 @@ Tensor& fill_diagonal_(Tensor& self, const Scalar& fill_value, bool wrap) { int64_t nDims = self.dim(); TORCH_CHECK(nDims >= 2, "dimensions must larger than 1"); - int64_t height = self.size(0); - int64_t width = self.size(1); + auto height = self.sym_size(0); + auto width = self.sym_size(1); if (nDims > 2) { - int64_t dim1 = height; for (const auto i : c10::irange(1, nDims)) { - if (self.size(i) != dim1) { + if (self.sym_size(i) != height) { TORCH_CHECK(false, "all dimensions of input must be of equal length"); } } } - int64_t storage_offset = self.storage_offset(); - std::vector sizes; - std::vector strides; - int64_t size = std::min(height, width); + auto storage_offset = self.sym_storage_offset(); + auto size = std::min(height, width); int64_t stride = 0; for (const auto i : c10::irange(nDims)) { stride += self.stride(i); } - strides.push_back(stride); - sizes.push_back(size); + std::vector strides{stride}; + std::vector sizes{size}; - auto main_diag = self.as_strided(sizes, strides, storage_offset); + auto main_diag = self.as_strided_symint(sizes, strides, storage_offset); main_diag.fill_(fill_value); if (wrap && nDims == 2 && height > width + 1) { - std::vector wrap_sizes; + auto step = width + 1; + auto wrap_size = ((self.numel() + step - 1) / step) - size; + std::vector wrap_sizes{wrap_size}; - int64_t step = width + 1; - int64_t wrap_size = ((self.numel() + step - 1) / step) - size; - wrap_sizes.push_back(wrap_size); + auto offset = self.stride(0) * (width + 1); - int64_t offset = self.stride(0) * (width + 1); - - auto wrap_diag = self.as_strided(wrap_sizes, strides, storage_offset + offset); + auto wrap_diag = self.as_strided_symint(wrap_sizes, strides, storage_offset + offset); wrap_diag.fill_(fill_value); } diff --git a/aten/src/ATen/native/FractionalMaxPool3d.cpp b/aten/src/ATen/native/FractionalMaxPool3d.cpp index d1fa7092f5f15..68328018b24b4 100644 --- a/aten/src/ATen/native/FractionalMaxPool3d.cpp +++ b/aten/src/ATen/native/FractionalMaxPool3d.cpp @@ -67,13 +67,13 @@ TORCH_PRECOMPUTE_META_FUNC(fractional_max_pool3d)( int64_t inputH = input_.size(heightDim); int64_t inputW = input_.size(widthDim); - TORCH_CHECK(outputT + poolSizeT - 1 < inputT, + TORCH_CHECK((poolSizeT <= inputT) && (outputT + poolSizeT - 1 < inputT), "fractional_max_pool3d_out(): pool time ", poolSizeT, " too large relative to input time ", inputT); - TORCH_CHECK(outputW + poolSizeW - 1 < inputW, + TORCH_CHECK((poolSizeW <= inputW) && (outputW + poolSizeW - 1 < inputW), "fractional_max_pool3d_out(): pool width ", poolSizeW, " too large relative to input width ", inputW); - TORCH_CHECK(outputH + poolSizeH - 1 < inputH, + TORCH_CHECK((poolSizeH <= inputH) && (outputH + poolSizeH - 1 < inputH), "fractional_max_pool3d_out(): pool height ", poolSizeH, " too large relative to input height ", inputH); diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index b62c584641dba..616e6ec60e13d 100644 --- 
a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -1360,7 +1360,8 @@ Tensor outer(const Tensor& self, const Tensor& vec2) { #endif -#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED() +#if !defined(__aarch64__) || AT_MKLDNN_ACL_ENABLED() +// Used by default on x86 platforms and on AArch64+ACL static inline int64_t get_mkldnn_matmul_min_dim() { static auto value = [&] { const int64_t default_min_dim = [&] { @@ -1395,8 +1396,6 @@ static inline bool apply_mkldnn_matmul_heur(int64_t m, int64_t k, int64_t n) { return at::globalContext().userEnabledMkldnn() && m > min_dim && k > min_dim && n > min_dim && m * k * n > min_size; } #endif - - static void addmm_impl_cpu_( Tensor &result, const Tensor &self, Tensor m1, Tensor m2, const Scalar& beta, const Scalar& alpha) { TORCH_INTERNAL_ASSERT(self.dim() == 2 && m1.dim() == 2 && m2.dim() == 2); @@ -1772,8 +1771,8 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens return (strides[2] == 1 && (sizes[1] == 1 || strides[1] >= sizes[2])) || (strides[1] == 1 && (sizes[2] == 1 || strides[2] >= sizes[1])); }; - -#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED() +#if !defined(__aarch64__) || AT_MKLDNN_ACL_ENABLED() + // Always apply mkldnn heuristic on x86 platform, but on ARM only if compiled with ACL bool apply_heur = apply_mkldnn_matmul_heur(batch1.sizes()[1], batch1.sizes()[2], batch2.sizes()[2]); if (apply_heur && use_mkldnn_matmul(batch1, batch2, self_or_result)) { try { @@ -1785,7 +1784,6 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens } } #endif - if (contraction_size * res_rows * res_cols < 400) { if (is_bmm_out) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, batch1.scalar_type(), "bmm", [&] { diff --git a/aten/src/ATen/native/MaxUnpooling.cpp b/aten/src/ATen/native/MaxUnpooling.cpp index a71db5e8ef8d1..f91b892efec21 100644 --- a/aten/src/ATen/native/MaxUnpooling.cpp +++ b/aten/src/ATen/native/MaxUnpooling.cpp @@ -23,8 +23,6 @@ Tensor& max_unpooling2d_forward_out_cpu( // Nondeterministic with duplicate indices at::globalContext().alertNotDeterministic("max_unpooling2d_forward_out"); - auto oheight = output_size[0]; - auto owidth = output_size[1]; TORCH_CHECK( indices_.scalar_type() == at::ScalarType::Long, "elements in indices should be type int64 but got: ", indices_.scalar_type()); @@ -45,6 +43,9 @@ Tensor& max_unpooling2d_forward_out_cpu( self_.sizes(), " with dimension ", i , " being empty."); } + auto oheight = output_size[0]; + auto owidth = output_size[1]; + auto memory_format = self_.suggest_memory_format(); auto self = self_.contiguous(memory_format); auto indices = indices_.contiguous(memory_format); diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index ac1086c6b6bd3..229d504b0a386 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -671,7 +671,9 @@ std::tuple _batch_norm_impl_index( std::cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index (calling miopen_batch_norm)" << std::endl; return std::tuple_cat( at::miopen_batch_norm( - input.contiguous(input.suggest_memory_format()), weight.contiguous(), bias.contiguous(), + input.contiguous(input.suggest_memory_format()), + weight.contiguous(), + bias.contiguous(), running_mean.defined() ? running_mean.contiguous() : running_mean, running_var.defined() ? 
running_var.contiguous() : running_var, training, momentum, eps), diff --git a/aten/src/ATen/native/PadNd.cpp b/aten/src/ATen/native/PadNd.cpp index 8099648d37b29..3c00a16108c12 100644 --- a/aten/src/ATen/native/PadNd.cpp +++ b/aten/src/ATen/native/PadNd.cpp @@ -73,7 +73,7 @@ Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value) for (const auto i : c10::irange((size_t)l_pad)) { auto pad_idx = pad.size() - ((i + 1) * 2); auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1]; - TORCH_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ", + TORCH_CHECK(new_dim >= 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ", pad[pad_idx], " and ", pad[pad_idx + 1], " resulted in a negative output size, " "which is invalid. Check dimension ", l_diff + i, " of your input."); new_shape.emplace_back(new_dim); diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index 408faea1b7644..7d613fc023120 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -2174,7 +2174,7 @@ static void _scatter_via_index_put( if (self.dim() == 1 || broadcast_index) { Tensor squeezed = index; if (broadcast_index && index.dim() > 1) { - for (const auto d : c10::irange(index.dim())) { + for (int64_t d = index.dim() - 1; d >= 0; --d) { if (d == dim) { continue; } diff --git a/aten/src/ATen/native/TriangularOps.cpp b/aten/src/ATen/native/TriangularOps.cpp index 47264c45205c0..08b666e296ed7 100644 --- a/aten/src/ATen/native/TriangularOps.cpp +++ b/aten/src/ATen/native/TriangularOps.cpp @@ -52,6 +52,7 @@ void apply_triu_tril_single( int64_t self_col_stride, bool upper) { constexpr int64_t zero = 0; + k = std::clamp(k, -n, m); // Clamp k to [-n, m] to prevent i + k arithmetic overflow, especially if k approaches INT64_MAX/INT64_MIN. 
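The effect of the clamp above is easiest to see from the Python side. A minimal sketch, not taken from the patch, assuming a CPU build that includes this change; the tensor contents and the extreme `diagonal` offsets are arbitrary illustration values:

import torch

# With k clamped to [-n, m], the `i + k` index arithmetic in the CPU kernel
# cannot overflow even for extreme diagonal offsets; the result is just the
# mathematically expected matrix (here, all zeros in both cases).
x = torch.arange(9.0).reshape(3, 3)
print(torch.triu(x, diagonal=2**62))   # no element lies on/above that diagonal -> zeros
print(torch.tril(x, diagonal=-2**62))  # no element lies on/below that diagonal -> zeros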
if (upper) { parallel_for(0, n, 0, [&](int64_t start, int64_t end) { diff --git a/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp b/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp index c775bc756145a..fca7d8bdce5ae 100644 --- a/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp +++ b/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp @@ -85,11 +85,11 @@ void cpu_max_unpool( if constexpr (is_3d) { TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(), " (output volumes are of size ", output_depth, - "x", output_height, "x", output_width); + "x", output_height, "x", output_width, ")"); } else { TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(), " (output volumes are of size ", output_height, - "x", output_width); + "x", output_width, ")"); } } diff --git a/aten/src/ATen/native/cuda/ActivationHardsigmoidKernel.cu b/aten/src/ATen/native/cuda/ActivationHardsigmoidKernel.cu index 8a3326fddb8a9..fcacef37ceaf0 100644 --- a/aten/src/ATen/native/cuda/ActivationHardsigmoidKernel.cu +++ b/aten/src/ATen/native/cuda/ActivationHardsigmoidKernel.cu @@ -36,7 +36,7 @@ void hardsigmoid_kernel(TensorIteratorBase& iter) { [zero, one_sixth, three, six] GPU_LAMBDA( scalar_t self_val) -> scalar_t { opmath_t x = static_cast(self_val); - return std::min(std::max(x + three, zero), six) * one_sixth; + return std::min(std::max(x + three, zero), six) * one_sixth; }); }); } diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index fcaae32e773f1..652dc8e121d0d 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -1081,16 +1081,6 @@ static bool _scaled_mm_allowed_device(bool sm90_only=false, bool sm100_only=fals #endif } -static bool _grouped_mm_allowed_device() { -#ifdef USE_ROCM - return false; -#else - auto dprops = at::cuda::getCurrentDeviceProperties(); - // CUDA capability 8.0 and greater - return dprops->major >= 8; -#endif -} - #ifdef USE_ROCM static bool _scaled_mm_is_fnuz() { return at::detail::getCUDAHooks().isGPUArch({"gfx942"}); @@ -1149,9 +1139,14 @@ bool is_blockwise_1x16_scaling(const at::Tensor& t, const at::Tensor& scale) { bool is_blockwise_1x32_scaling(const at::Tensor& t, const at::Tensor& scale) { // TODO: We might want to enforce some structure on the shapes of the scale // tensors - return (isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat8_e8m0fnu - && scale.numel() == round_up(t.size(0), 128) * round_up(ceil_div(t.size(1), 32), 4) - && scale.is_contiguous()); + bool is_fp8_path = (isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat8_e8m0fnu + && scale.numel() == round_up(t.size(0), 128) * round_up(ceil_div(t.size(1), 32), 4)); + bool is_packed_fp4_path = false; +#ifdef USE_ROCM + is_packed_fp4_path = (t.scalar_type() == ScalarType::Float4_e2m1fn_x2 && scale.scalar_type() == at::kFloat8_e8m0fnu + && scale.numel() == round_up(t.size(0), 128) * round_up(ceil_div(t.size(1) * 2, 32), 4)); +#endif + return (is_fp8_path || is_packed_fp4_path) && scale.is_contiguous(); } bool is_blockwise_1x128_scaling(const at::Tensor& t, const at::Tensor& scale) { @@ -1392,9 +1387,15 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}), "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950"); - TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 && - mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0, - "Matrix dimensions must be multiples of 32 for block-wise scaling"); + int packed_factor 
= 1; + if (mat1.scalar_type() == ScalarType::Float4_e2m1fn_x2) { + // For float4 data type, each byte stores two 4-bit floating-point values, + // effectively packing two elements into one byte. + packed_factor = 2; + } + TORCH_CHECK(mat1.size(0) % 16 == 0 && (mat1.size(1) * packed_factor) % 128 == 0 && + mat2.size(1) % 16 == 0, + "M, N must be multiples of 16 and K must be multiple of 128 for block-wise scaling"); TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 || out.scalar_type() == ScalarType::Half, @@ -1787,14 +1788,19 @@ Tensor _grouped_mm_cuda(const Tensor& mat_a, const Tensor& mat_b, const std::optional& offs, const std::optional& bias, std::optional out_dtype) { -#ifndef USE_ROCM _grouped_mm_validate_inputs(mat_a, mat_b, offs, bias, out_dtype); bool a_b_and_out_are_bf16 = ( mat_a.dtype() == at::kBFloat16 && mat_b.dtype() == at::kBFloat16 && out_dtype.value_or(at::kBFloat16) == at::kBFloat16 ); +#ifndef USE_ROCM bool use_fast_path = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true) && a_b_and_out_are_bf16; +#else + // _scaled_mm_allowed_device is used here within _grouped_mm_cuda which seems incorrect since scale is not used. + // the _grouped_mm_fallback should be safe for any ROCm GPU since it's just calling typical mm/bmm + bool use_fast_path = false; +#endif const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype); Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_); if (use_fast_path) { @@ -1804,9 +1810,6 @@ std::optional out_dtype) { _grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out); } return out; -#else - TORCH_CHECK(false, "grouped gemm is not supported on ROCM") -#endif } Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) { diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu index 7ee02b02b41f1..227d42247ebd9 100644 --- a/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu +++ b/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu @@ -51,7 +51,7 @@ std::vector foreach_tensor_list_op( Op(), alpha.to()); - return tensor_lists[2]; + return std::move(tensor_lists[2]); } template class Op> diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu index 80d748dd3579b..9ac0e875b2d68 100644 --- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu +++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu @@ -45,7 +45,7 @@ std::vector foreach_binary_op( /* res_arg_index */ 1>(), Op(), scalar.to()); - return tensor_lists[1]; + return std::move(tensor_lists[1]); } template class Op> diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu index dcb93188b5e69..b28aa690630b4 100644 --- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu +++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu @@ -33,7 +33,7 @@ std::vector foreach_binary_op( } tensor_lists.emplace_back(tensors.vec()); - tensor_lists.emplace_back(vec_res); + tensor_lists.emplace_back(std::move(vec_res)); using opmath_t = at::opmath_type; multi_tensor_apply<2, opmath_t>( @@ -46,7 +46,7 @@ std::vector foreach_binary_op( /* res_arg_index */ 1>(), Op()); - return tensor_lists[1]; + return std::move(tensor_lists[1]); } template class Op> diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu index ad5eeee5ebec4..bc6bd37891258 100644 --- 
a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu +++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu @@ -56,7 +56,7 @@ std::vector foreach_binary_op( Op(), scalar.data_ptr(), alpha.to()); - return tensor_lists[1]; + return std::move(tensor_lists[1]); } template class Op> diff --git a/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu b/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu index 7a3276c44750a..7f563f55d5565 100644 --- a/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu +++ b/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu @@ -57,7 +57,7 @@ std::vector foreach_pointwise_op( scalar.to()); }); - return tensor_lists[3]; + return std::move(tensor_lists[3]); } template
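For context on the DLConvertor.cpp and dlpack.h hunks earlier in this diff, which map the float8 (and packed float4) dtypes onto the new DLDataTypeCode values instead of rejecting them: a minimal Python sketch of the intended round trip, assuming a build that includes those hunks; the shape and the float8_e5m2 choice are arbitrary.

import torch
from torch.utils import dlpack

# Export a float8 tensor through DLPack (previously this raised
# "float8 types are not supported by dlpack") and re-import it.
x = torch.randn(4, 4).to(torch.float8_e5m2)
capsule = dlpack.to_dlpack(x)       # carries kDLFloat8_e5m2, bits=8, lanes=1
y = torch.from_dlpack(capsule)      # zero-copy view over the same storage
assert y.dtype == torch.float8_e5m2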