diff --git a/.ci/aarch64_linux/README.md b/.ci/aarch64_linux/README.md
deleted file mode 100644
index 583ed4af99844..0000000000000
--- a/.ci/aarch64_linux/README.md
+++ /dev/null
@@ -1,19 +0,0 @@
-# Aarch64 (ARM/Graviton) Support Scripts
-Scripts for building aarch64 PyTorch PIP Wheels. These scripts build the following wheels:
-* torch
-* torchvision
-* torchaudio
-* torchtext
-* torchdata
-## Aarch64_ci_build.sh
-This script is design to support CD operations within PyPi manylinux aarch64 container, and be executed in the container. It prepares the container and then executes __aarch64_wheel_ci_build.py__ to build the wheels. The script "assumes" the PyTorch repo is located at: ```/pytorch``` and will put the wheels into ```/artifacts```.
-### Usage
-```DESIRED_PYTHON= aarch64_ci_build.sh```
-
-__NOTE:__ CI build is currently __EXPERMINTAL__
-
-## Build_aarch64_wheel.py
-This app allows a person to build using AWS EC3 resources and requires AWS-CLI and Boto3 with AWS credentials to support building EC2 instances for the wheel builds. Can be used in a codebuild CD or from a local system.
-
-### Usage
-```build_aarch64_wheel.py --key-name --use-docker --python 3.8 --branch ```
diff --git a/.ci/aarch64_linux/aarch64_ci_build.sh b/.ci/aarch64_linux/aarch64_ci_build.sh
deleted file mode 100644
index b25f3b21e8eb1..0000000000000
--- a/.ci/aarch64_linux/aarch64_ci_build.sh
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/bin/bash
-set -eux -o pipefail
-
-GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
-
-# Set CUDA architecture lists to match x86 build_cuda.sh
-if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
-    export TORCH_CUDA_ARCH_LIST="8.0;9.0"
-elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
-    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
-elif [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
-    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
-elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
-    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX"
-fi
-
-# Compress the fatbin with -compress-mode=size for CUDA 13
-if [[ "$DESIRED_CUDA" == *"13"* ]]; then
-    export TORCH_NVCC_FLAGS="-compress-mode=size"
-    # Bundle ptxas into the cu13 wheel, see https://github.com/pytorch/pytorch/issues/163801
-    export BUILD_BUNDLE_PTXAS=1
-fi
-
-SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
-source $SCRIPTPATH/aarch64_ci_setup.sh
-
-###############################################################################
-# Run aarch64 builder python
-###############################################################################
-cd /
-# adding safe directory for git as the permissions will be
-# on the mounted pytorch repo
-git config --global --add safe.directory /pytorch
-pip install -r /pytorch/requirements.txt
-pip install auditwheel==6.2.0 wheel
-if [ "$DESIRED_CUDA" = "cpu" ]; then
-    echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
-    python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
-else
-    echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
-    export USE_SYSTEM_NCCL=1
-
-    # Check if we should use NVIDIA libs from PyPI (similar to x86 build_cuda.sh logic)
-    if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
-        echo "Bundling CUDA libraries with wheel for aarch64."
-    else
-        echo "Using nvidia libs from pypi for aarch64."
-        echo "Updated PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64: $PYTORCH_EXTRA_INSTALL_REQUIREMENTS"
-        export USE_NVIDIA_PYPI_LIBS=1
-    fi
-
-    python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
-fi
diff --git a/.ci/aarch64_linux/aarch64_ci_setup.sh b/.ci/aarch64_linux/aarch64_ci_setup.sh
deleted file mode 100755
index 8ffba65d7fedd..0000000000000
--- a/.ci/aarch64_linux/aarch64_ci_setup.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-set -eux -o pipefail
-
-# This script is used to prepare the Docker container for aarch64_ci_wheel_build.py python script
-# By creating symlinks from desired /opt/python to /usr/local/bin/
-
-NUMPY_VERSION=2.0.2
-if [[ "$DESIRED_PYTHON" == "3.13" || "$DESIRED_PYTHON" == "3.13t" ]]; then
-    NUMPY_VERSION=2.1.2
-fi
-
-SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
-source $SCRIPTPATH/../manywheel/set_desired_python.sh
-
-pip install -q numpy==${NUMPY_VERSION} pyyaml==6.0.2 scons==4.7.0 ninja==1.11.1 patchelf==0.17.2
-
-for tool in python python3 pip pip3 ninja scons patchelf; do
-    ln -sf ${DESIRED_PYTHON_BIN_DIR}/${tool} /usr/local/bin;
-done
-
-python --version
diff --git a/.ci/aarch64_linux/aarch64_wheel_ci_build.py b/.ci/aarch64_linux/aarch64_wheel_ci_build.py
deleted file mode 100755
index a99e5f8f65659..0000000000000
--- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py
+++ /dev/null
@@ -1,333 +0,0 @@
-#!/usr/bin/env python3
-# encoding: UTF-8
-
-import os
-import shutil
-from subprocess import check_call, check_output
-
-
-def list_dir(path: str) -> list[str]:
-    """'
-    Helper for getting paths for Python
-    """
-    return check_output(["ls", "-1", path]).decode().split("\n")
-
-
-def replace_tag(filename) -> None:
-    with open(filename) as f:
-        lines = f.readlines()
-    for i, line in enumerate(lines):
-        if line.startswith("Tag:"):
-            lines[i] = line.replace("-linux_", "-manylinux_2_28_")
-            print(f"Updated tag from {line} to {lines[i]}")
-            break
-
-    with open(filename, "w") as f:
-        f.writelines(lines)
-
-
-def patch_library_rpath(
-    folder: str,
-    lib_name: str,
-    use_nvidia_pypi_libs: bool = False,
-    desired_cuda: str = "",
-) -> None:
-    """Apply patchelf to set RPATH for a library in torch/lib"""
-    lib_path = f"{folder}/tmp/torch/lib/{lib_name}"
-
-    if use_nvidia_pypi_libs:
-        # For PyPI NVIDIA libraries, construct CUDA RPATH
-        cuda_rpaths = [
-            "$ORIGIN/../../nvidia/cudnn/lib",
-            "$ORIGIN/../../nvidia/nvshmem/lib",
-            "$ORIGIN/../../nvidia/nccl/lib",
-            "$ORIGIN/../../nvidia/cusparselt/lib",
-        ]
-
-        if "130" in desired_cuda:
-            cuda_rpaths.append("$ORIGIN/../../nvidia/cu13/lib")
-        else:
-            cuda_rpaths.extend(
-                [
-                    "$ORIGIN/../../nvidia/cublas/lib",
-                    "$ORIGIN/../../nvidia/cuda_cupti/lib",
-                    "$ORIGIN/../../nvidia/cuda_nvrtc/lib",
-                    "$ORIGIN/../../nvidia/cuda_runtime/lib",
-                    "$ORIGIN/../../nvidia/cufft/lib",
-                    "$ORIGIN/../../nvidia/curand/lib",
-                    "$ORIGIN/../../nvidia/cusolver/lib",
-                    "$ORIGIN/../../nvidia/cusparse/lib",
-                    "$ORIGIN/../../nvidia/nvtx/lib",
-                    "$ORIGIN/../../nvidia/cufile/lib",
-                ]
-            )
-
-        # Add $ORIGIN for local torch libs
-        rpath = ":".join(cuda_rpaths) + ":$ORIGIN"
-    else:
-        # For bundled libraries, just use $ORIGIN
-        rpath = "$ORIGIN"
-
-    if os.path.exists(lib_path):
-        os.system(
-            f"cd {folder}/tmp/torch/lib/; "
-            f"patchelf --set-rpath '{rpath}' --force-rpath {lib_name}"
-        )
-
-
-def copy_and_patch_library(
-    src_path: str,
-    folder: str,
-    use_nvidia_pypi_libs: bool = False,
-    desired_cuda: str = "",
-) -> None:
-    """Copy a library to torch/lib and patch its RPATH"""
-    if os.path.exists(src_path):
-        lib_name = os.path.basename(src_path)
-        shutil.copy2(src_path, f"{folder}/tmp/torch/lib/{lib_name}")
-        patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda)
-
-
-def package_cuda_wheel(wheel_path, desired_cuda) -> None:
-    """
-    Package the cuda wheel libraries
-    """
-    folder = os.path.dirname(wheel_path)
-    os.mkdir(f"{folder}/tmp")
-    os.system(f"unzip {wheel_path} -d {folder}/tmp")
-    # Delete original wheel since it will be repackaged
-    os.system(f"rm {wheel_path}")
-
-    # Check if we should use PyPI NVIDIA libraries or bundle system libraries
-    use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"
-
-    if use_nvidia_pypi_libs:
-        print("Using nvidia libs from pypi - skipping CUDA library bundling")
-        # For PyPI approach, we don't bundle CUDA libraries - they come from PyPI packages
-        # We only need to bundle non-NVIDIA libraries
-        minimal_libs_to_copy = [
-            "/lib64/libgomp.so.1",
-            "/usr/lib64/libgfortran.so.5",
-            "/acl/build/libarm_compute.so",
-            "/acl/build/libarm_compute_graph.so",
-            "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
-            "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
-            "/usr/local/lib/libnvpl_lapack_core.so.0",
-            "/usr/local/lib/libnvpl_blas_core.so.0",
-        ]
-
-        # Copy minimal libraries to unzipped_folder/torch/lib
-        for lib_path in minimal_libs_to_copy:
-            copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda)
-
-        # Patch torch libraries used for searching libraries
-        torch_libs_to_patch = [
-            "libtorch.so",
-            "libtorch_cpu.so",
-            "libtorch_cuda.so",
-            "libtorch_cuda_linalg.so",
-            "libtorch_global_deps.so",
-            "libtorch_python.so",
-            "libtorch_nvshmem.so",
-            "libc10.so",
-            "libc10_cuda.so",
-            "libcaffe2_nvrtc.so",
-            "libshm.so",
-        ]
-        for lib_name in torch_libs_to_patch:
-            patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda)
-    else:
-        print("Bundling CUDA libraries with wheel")
-        # Original logic for bundling system CUDA libraries
-        # Common libraries for all CUDA versions
-        common_libs = [
-            # Non-NVIDIA system libraries
-            "/lib64/libgomp.so.1",
-            "/usr/lib64/libgfortran.so.5",
-            "/acl/build/libarm_compute.so",
-            "/acl/build/libarm_compute_graph.so",
-            # Common CUDA libraries (same for all versions)
-            "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
-            "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
-            "/usr/local/lib/libnvpl_lapack_core.so.0",
-            "/usr/local/lib/libnvpl_blas_core.so.0",
-            "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so",
-            "/usr/local/cuda/lib64/libcudnn.so.9",
-            "/usr/local/cuda/lib64/libcusparseLt.so.0",
-            "/usr/local/cuda/lib64/libcurand.so.10",
-            "/usr/local/cuda/lib64/libnccl.so.2",
-            "/usr/local/cuda/lib64/libnvshmem_host.so.3",
-            "/usr/local/cuda/lib64/libcudnn_adv.so.9",
-            "/usr/local/cuda/lib64/libcudnn_cnn.so.9",
-            "/usr/local/cuda/lib64/libcudnn_graph.so.9",
-            "/usr/local/cuda/lib64/libcudnn_ops.so.9",
-            "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9",
-            "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9",
-            "/usr/local/cuda/lib64/libcudnn_heuristic.so.9",
-            "/usr/local/cuda/lib64/libcufile.so.0",
-            "/usr/local/cuda/lib64/libcufile_rdma.so.1",
-            "/usr/local/cuda/lib64/libcusparse.so.12",
-        ]
-
-        # CUDA version-specific libraries
-        if "13" in desired_cuda:
-            minor_version = desired_cuda[-1]
-            version_specific_libs = [
-                "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13",
-                "/usr/local/cuda/lib64/libcublas.so.13",
-                "/usr/local/cuda/lib64/libcublasLt.so.13",
-                "/usr/local/cuda/lib64/libcudart.so.13",
-                "/usr/local/cuda/lib64/libcufft.so.12",
-                "/usr/local/cuda/lib64/libcusolver.so.12",
-                "/usr/local/cuda/lib64/libnvJitLink.so.13",
-                "/usr/local/cuda/lib64/libnvrtc.so.13",
-                f"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.{minor_version}",
-            ]
-        elif "12" in desired_cuda:
-            # Get the last character for libnvrtc-builtins version (e.g., "129" -> "9")
-            minor_version = desired_cuda[-1]
-            version_specific_libs = [
-                "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
-                "/usr/local/cuda/lib64/libcublas.so.12",
-                "/usr/local/cuda/lib64/libcublasLt.so.12",
-                "/usr/local/cuda/lib64/libcudart.so.12",
-                "/usr/local/cuda/lib64/libcufft.so.11",
-                "/usr/local/cuda/lib64/libcusolver.so.11",
-                "/usr/local/cuda/lib64/libnvJitLink.so.12",
-                "/usr/local/cuda/lib64/libnvrtc.so.12",
-                f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}",
-            ]
-        else:
-            raise ValueError(f"Unsupported CUDA version: {desired_cuda}.")
-
-        # Combine all libraries
-        libs_to_copy = common_libs + version_specific_libs
-
-        # Copy libraries to unzipped_folder/torch/lib
-        for lib_path in libs_to_copy:
-            copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda)
-
-    # Make sure the wheel is tagged with manylinux_2_28
-    for f in os.scandir(f"{folder}/tmp/"):
-        if f.is_dir() and f.name.endswith(".dist-info"):
-            replace_tag(f"{f.path}/WHEEL")
-            break
-
-    os.system(f"wheel pack {folder}/tmp/ -d {folder}")
-    os.system(f"rm -rf {folder}/tmp/")
-
-
-def complete_wheel(folder: str) -> str:
-    """
-    Complete wheel build and put in artifact location
-    """
-    wheel_name = list_dir(f"/{folder}/dist")[0]
-
-    # Please note for cuda we don't run auditwheel since we use custom script to package
-    # the cuda dependencies to the wheel file using update_wheel() method.
-    # However we need to make sure filename reflects the correct Manylinux platform.
- if "pytorch" in folder and not enable_cuda: - print("Repairing Wheel with AuditWheel") - check_call(["auditwheel", "repair", f"dist/{wheel_name}"], cwd=folder) - repaired_wheel_name = list_dir(f"/{folder}/wheelhouse")[0] - - print(f"Moving {repaired_wheel_name} wheel to /{folder}/dist") - os.rename( - f"/{folder}/wheelhouse/{repaired_wheel_name}", - f"/{folder}/dist/{repaired_wheel_name}", - ) - else: - repaired_wheel_name = list_dir(f"/{folder}/dist")[0] - - print(f"Copying {repaired_wheel_name} to artifacts") - shutil.copy2( - f"/{folder}/dist/{repaired_wheel_name}", f"/artifacts/{repaired_wheel_name}" - ) - - return repaired_wheel_name - - -def parse_arguments(): - """ - Parse inline arguments - """ - from argparse import ArgumentParser - - parser = ArgumentParser("AARCH64 wheels python CD") - parser.add_argument("--debug", action="store_true") - parser.add_argument("--build-only", action="store_true") - parser.add_argument("--test-only", type=str) - parser.add_argument("--enable-mkldnn", action="store_true") - parser.add_argument("--enable-cuda", action="store_true") - return parser.parse_args() - - -if __name__ == "__main__": - """ - Entry Point - """ - args = parse_arguments() - enable_mkldnn = args.enable_mkldnn - enable_cuda = args.enable_cuda - branch = check_output( - ["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd="/pytorch" - ).decode() - - print("Building PyTorch wheel") - build_vars = "" - # MAX_JOB=5 is not required for CPU backend (see commit 465d98b) - if enable_cuda: - build_vars += "MAX_JOBS=5 " - - # Handle PyPI NVIDIA libraries vs bundled libraries - use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1" - if use_nvidia_pypi_libs: - print("Configuring build for PyPI NVIDIA libraries") - # Configure for dynamic linking (matching x86 logic) - build_vars += "ATEN_STATIC_CUDA=0 USE_CUDA_STATIC_LINK=0 USE_CUPTI_SO=1 " - else: - print("Configuring build for bundled NVIDIA libraries") - # Keep existing static linking approach - already configured above - - override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION") - desired_cuda = os.getenv("DESIRED_CUDA") - if override_package_version is not None: - version = override_package_version - build_vars += ( - f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version} PYTORCH_BUILD_NUMBER=1 " - ) - elif branch in ["nightly", "main"]: - build_date = ( - check_output(["git", "log", "--pretty=format:%cs", "-1"], cwd="/pytorch") - .decode() - .replace("-", "") - ) - version = ( - check_output(["cat", "version.txt"], cwd="/pytorch").decode().strip()[:-2] - ) - if enable_cuda: - build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date}+{desired_cuda} PYTORCH_BUILD_NUMBER=1 " - else: - build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 " - elif branch.startswith(("v1.", "v2.")): - build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 " - - if enable_mkldnn: - print("build pytorch with mkldnn+acl backend") - build_vars += "USE_MKLDNN=ON USE_MKLDNN_ACL=ON " - build_vars += "ACL_ROOT_DIR=/acl " - if enable_cuda: - build_vars += "BLAS=NVPL " - else: - build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/opt/OpenBLAS " - else: - print("build pytorch without mkldnn backend") - - os.system(f"cd /pytorch; {build_vars} python3 -m build --wheel --no-isolation") - if enable_cuda: - print("Updating Cuda Dependency") - filename = os.listdir("/pytorch/dist/") - wheel_path = f"/pytorch/dist/{filename[0]}" - 
package_cuda_wheel(wheel_path, desired_cuda) - pytorch_wheel_name = complete_wheel("/pytorch/") - print(f"Build Complete. Created {pytorch_wheel_name}..") diff --git a/.ci/aarch64_linux/build_aarch64_wheel.py b/.ci/aarch64_linux/build_aarch64_wheel.py deleted file mode 100755 index a157ec57b574a..0000000000000 --- a/.ci/aarch64_linux/build_aarch64_wheel.py +++ /dev/null @@ -1,999 +0,0 @@ -#!/usr/bin/env python3 - -# This script is for building AARCH64 wheels using AWS EC2 instances. -# To generate binaries for the release follow these steps: -# 1. Update mappings for each of the Domain Libraries by adding new row to a table like this: -# "v1.11.0": ("0.11.0", "rc1"), -# 2. Run script with following arguments for each of the supported python versions and required tag, for example: -# build_aarch64_wheel.py --key-name --use-docker --python 3.8 --branch v1.11.0-rc3 - - -import os -import subprocess -import sys -import time -from typing import Optional, Union - -import boto3 - - -# AMI images for us-east-1, change the following based on your ~/.aws/config -os_amis = { - "ubuntu20_04": "ami-052eac90edaa9d08f", # login_name: ubuntu - "ubuntu22_04": "ami-0c6c29c5125214c77", # login_name: ubuntu - "redhat8": "ami-0698b90665a2ddcf1", # login_name: ec2-user -} - -ubuntu20_04_ami = os_amis["ubuntu20_04"] - - -def compute_keyfile_path(key_name: Optional[str] = None) -> tuple[str, str]: - if key_name is None: - key_name = os.getenv("AWS_KEY_NAME") - if key_name is None: - return os.getenv("SSH_KEY_PATH", ""), "" - - homedir_path = os.path.expanduser("~") - default_path = os.path.join(homedir_path, ".ssh", f"{key_name}.pem") - return os.getenv("SSH_KEY_PATH", default_path), key_name - - -ec2 = boto3.resource("ec2") - - -def ec2_get_instances(filter_name, filter_value): - return ec2.instances.filter( - Filters=[{"Name": filter_name, "Values": [filter_value]}] - ) - - -def ec2_instances_of_type(instance_type="t4g.2xlarge"): - return ec2_get_instances("instance-type", instance_type) - - -def ec2_instances_by_id(instance_id): - rc = list(ec2_get_instances("instance-id", instance_id)) - return rc[0] if len(rc) > 0 else None - - -def start_instance( - key_name, ami=ubuntu20_04_ami, instance_type="t4g.2xlarge", ebs_size: int = 50 -): - inst = ec2.create_instances( - ImageId=ami, - InstanceType=instance_type, - SecurityGroups=["ssh-allworld"], - KeyName=key_name, - MinCount=1, - MaxCount=1, - BlockDeviceMappings=[ - { - "DeviceName": "/dev/sda1", - "Ebs": { - "DeleteOnTermination": True, - "VolumeSize": ebs_size, - "VolumeType": "standard", - }, - } - ], - )[0] - print(f"Create instance {inst.id}") - inst.wait_until_running() - running_inst = ec2_instances_by_id(inst.id) - print(f"Instance started at {running_inst.public_dns_name}") - return running_inst - - -class RemoteHost: - addr: str - keyfile_path: str - login_name: str - container_id: Optional[str] = None - ami: Optional[str] = None - - def __init__(self, addr: str, keyfile_path: str, login_name: str = "ubuntu"): - self.addr = addr - self.keyfile_path = keyfile_path - self.login_name = login_name - - def _gen_ssh_prefix(self) -> list[str]: - return [ - "ssh", - "-o", - "StrictHostKeyChecking=no", - "-i", - self.keyfile_path, - f"{self.login_name}@{self.addr}", - "--", - ] - - @staticmethod - def _split_cmd(args: Union[str, list[str]]) -> list[str]: - return args.split() if isinstance(args, str) else args - - def run_ssh_cmd(self, args: Union[str, list[str]]) -> None: - subprocess.check_call(self._gen_ssh_prefix() + self._split_cmd(args)) - - def 
check_ssh_output(self, args: Union[str, list[str]]) -> str: - return subprocess.check_output( - self._gen_ssh_prefix() + self._split_cmd(args) - ).decode("utf-8") - - def scp_upload_file(self, local_file: str, remote_file: str) -> None: - subprocess.check_call( - [ - "scp", - "-i", - self.keyfile_path, - local_file, - f"{self.login_name}@{self.addr}:{remote_file}", - ] - ) - - def scp_download_file( - self, remote_file: str, local_file: Optional[str] = None - ) -> None: - if local_file is None: - local_file = "." - subprocess.check_call( - [ - "scp", - "-i", - self.keyfile_path, - f"{self.login_name}@{self.addr}:{remote_file}", - local_file, - ] - ) - - def start_docker(self, image="quay.io/pypa/manylinux2014_aarch64:latest") -> None: - self.run_ssh_cmd("sudo apt-get install -y docker.io") - self.run_ssh_cmd(f"sudo usermod -a -G docker {self.login_name}") - self.run_ssh_cmd("sudo service docker start") - self.run_ssh_cmd(f"docker pull {image}") - self.container_id = self.check_ssh_output( - f"docker run -t -d -w /root {image}" - ).strip() - - def using_docker(self) -> bool: - return self.container_id is not None - - def run_cmd(self, args: Union[str, list[str]]) -> None: - if not self.using_docker(): - return self.run_ssh_cmd(args) - assert self.container_id is not None - docker_cmd = self._gen_ssh_prefix() + [ - "docker", - "exec", - "-i", - self.container_id, - "bash", - ] - p = subprocess.Popen(docker_cmd, stdin=subprocess.PIPE) - p.communicate( - input=" ".join(["source .bashrc && "] + self._split_cmd(args)).encode( - "utf-8" - ) - ) - rc = p.wait() - if rc != 0: - raise subprocess.CalledProcessError(rc, docker_cmd) - - def check_output(self, args: Union[str, list[str]]) -> str: - if not self.using_docker(): - return self.check_ssh_output(args) - assert self.container_id is not None - docker_cmd = self._gen_ssh_prefix() + [ - "docker", - "exec", - "-i", - self.container_id, - "bash", - ] - p = subprocess.Popen(docker_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) - (out, err) = p.communicate( - input=" ".join(["source .bashrc && "] + self._split_cmd(args)).encode( - "utf-8" - ) - ) - rc = p.wait() - if rc != 0: - raise subprocess.CalledProcessError(rc, docker_cmd, output=out, stderr=err) - return out.decode("utf-8") - - def upload_file(self, local_file: str, remote_file: str) -> None: - if not self.using_docker(): - return self.scp_upload_file(local_file, remote_file) - tmp_file = os.path.join("/tmp", os.path.basename(local_file)) - self.scp_upload_file(local_file, tmp_file) - self.run_ssh_cmd( - ["docker", "cp", tmp_file, f"{self.container_id}:/root/{remote_file}"] - ) - self.run_ssh_cmd(["rm", tmp_file]) - - def download_file(self, remote_file: str, local_file: Optional[str] = None) -> None: - if not self.using_docker(): - return self.scp_download_file(remote_file, local_file) - tmp_file = os.path.join("/tmp", os.path.basename(remote_file)) - self.run_ssh_cmd( - ["docker", "cp", f"{self.container_id}:/root/{remote_file}", tmp_file] - ) - self.scp_download_file(tmp_file, local_file) - self.run_ssh_cmd(["rm", tmp_file]) - - def download_wheel( - self, remote_file: str, local_file: Optional[str] = None - ) -> None: - if self.using_docker() and local_file is None: - basename = os.path.basename(remote_file) - local_file = basename.replace( - "-linux_aarch64.whl", "-manylinux2014_aarch64.whl" - ) - self.download_file(remote_file, local_file) - - def list_dir(self, path: str) -> list[str]: - return self.check_output(["ls", "-1", path]).split("\n") - - -def wait_for_connection(addr, 
port, timeout=15, attempt_cnt=5): - import socket - - for i in range(attempt_cnt): - try: - with socket.create_connection((addr, port), timeout=timeout): - return - except (ConnectionRefusedError, TimeoutError): # noqa: PERF203 - if i == attempt_cnt - 1: - raise - time.sleep(timeout) - - -def update_apt_repo(host: RemoteHost) -> None: - time.sleep(5) - host.run_cmd("sudo systemctl stop apt-daily.service || true") - host.run_cmd("sudo systemctl stop unattended-upgrades.service || true") - host.run_cmd( - "while systemctl is-active --quiet apt-daily.service; do sleep 1; done" - ) - host.run_cmd( - "while systemctl is-active --quiet unattended-upgrades.service; do sleep 1; done" - ) - host.run_cmd("sudo apt-get update") - time.sleep(3) - host.run_cmd("sudo apt-get update") - - -def install_condaforge( - host: RemoteHost, suffix: str = "latest/download/Miniforge3-Linux-aarch64.sh" -) -> None: - print("Install conda-forge") - host.run_cmd(f"curl -OL https://github.com/conda-forge/miniforge/releases/{suffix}") - host.run_cmd(f"sh -f {os.path.basename(suffix)} -b") - host.run_cmd(f"rm -f {os.path.basename(suffix)}") - if host.using_docker(): - host.run_cmd("echo 'PATH=$HOME/miniforge3/bin:$PATH'>>.bashrc") - else: - host.run_cmd( - [ - "sed", - "-i", - "'/^# If not running interactively.*/i PATH=$HOME/miniforge3/bin:$PATH'", - ".bashrc", - ] - ) - - -def install_condaforge_python(host: RemoteHost, python_version="3.8") -> None: - if python_version == "3.6": - # Python-3.6 EOLed and not compatible with conda-4.11 - install_condaforge( - host, suffix="download/4.10.3-10/Miniforge3-4.10.3-10-Linux-aarch64.sh" - ) - host.run_cmd(f"conda install -y python={python_version} numpy pyyaml") - else: - install_condaforge( - host, suffix="download/4.11.0-4/Miniforge3-4.11.0-4-Linux-aarch64.sh" - ) - # Pytorch-1.10 or older are not compatible with setuptools=59.6 or newer - host.run_cmd( - f"conda install -y python={python_version} numpy pyyaml setuptools>=59.5.0" - ) - - -def embed_libgomp(host: RemoteHost, use_conda, wheel_name) -> None: - host.run_cmd("pip3 install auditwheel") - host.run_cmd( - "conda install -y patchelf" if use_conda else "sudo apt-get install -y patchelf" - ) - from tempfile import NamedTemporaryFile - - with NamedTemporaryFile() as tmp: - tmp.write(embed_library_script.encode("utf-8")) - tmp.flush() - host.upload_file(tmp.name, "embed_library.py") - - print("Embedding libgomp into wheel") - if host.using_docker(): - host.run_cmd(f"python3 embed_library.py {wheel_name} --update-tag") - else: - host.run_cmd(f"python3 embed_library.py {wheel_name}") - - -def checkout_repo( - host: RemoteHost, - *, - branch: str = "main", - url: str, - git_clone_flags: str, - mapping: dict[str, tuple[str, str]], -) -> Optional[str]: - for prefix in mapping: - if not branch.startswith(prefix): - continue - tag = f"v{mapping[prefix][0]}-{mapping[prefix][1]}" - host.run_cmd(f"git clone {url} -b {tag} {git_clone_flags}") - return mapping[prefix][0] - - host.run_cmd(f"git clone {url} -b {branch} {git_clone_flags}") - return None - - -def build_torchvision( - host: RemoteHost, - *, - branch: str = "main", - use_conda: bool = True, - git_clone_flags: str, - run_smoke_tests: bool = True, -) -> str: - print("Checking out TorchVision repo") - build_version = checkout_repo( - host, - branch=branch, - url="https://github.com/pytorch/vision", - git_clone_flags=git_clone_flags, - mapping={ - "v1.7.1": ("0.8.2", "rc2"), - "v1.8.0": ("0.9.0", "rc3"), - "v1.8.1": ("0.9.1", "rc1"), - "v1.9.0": ("0.10.0", "rc1"), - 
"v1.10.0": ("0.11.1", "rc1"), - "v1.10.1": ("0.11.2", "rc1"), - "v1.10.2": ("0.11.3", "rc1"), - "v1.11.0": ("0.12.0", "rc1"), - "v1.12.0": ("0.13.0", "rc4"), - "v1.12.1": ("0.13.1", "rc6"), - "v1.13.0": ("0.14.0", "rc4"), - "v1.13.1": ("0.14.1", "rc2"), - "v2.0.0": ("0.15.1", "rc2"), - "v2.0.1": ("0.15.2", "rc2"), - }, - ) - print("Building TorchVision wheel") - - # Please note libnpg and jpeg are required to build image.so extension - if use_conda: - host.run_cmd("conda install -y libpng jpeg") - # Remove .so files to force static linking - host.run_cmd( - "rm miniforge3/lib/libpng.so miniforge3/lib/libpng16.so miniforge3/lib/libjpeg.so" - ) - # And patch setup.py to include libz dependency for libpng - host.run_cmd( - [ - 'sed -i -e \'s/image_link_flags\\.append("png")/image_link_flags += ["png", "z"]/\' vision/setup.py' - ] - ) - - build_vars = "" - if branch == "nightly": - version = host.check_output( - ["if [ -f vision/version.txt ]; then cat vision/version.txt; fi"] - ).strip() - if len(version) == 0: - # In older revisions, version was embedded in setup.py - version = ( - host.check_output(["grep", '"version = \'"', "vision/setup.py"]) - .strip() - .split("'")[1][:-2] - ) - build_date = ( - host.check_output("cd vision && git log --pretty=format:%s -1") - .strip() - .split()[0] - .replace("-", "") - ) - build_vars += f"BUILD_VERSION={version}.dev{build_date}" - elif build_version is not None: - build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}" - if host.using_docker(): - build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" - - host.run_cmd(f"cd vision && {build_vars} python3 -m build --wheel --no-isolation") - vision_wheel_name = host.list_dir("vision/dist")[0] - embed_libgomp(host, use_conda, os.path.join("vision", "dist", vision_wheel_name)) - - print("Copying TorchVision wheel") - host.download_wheel(os.path.join("vision", "dist", vision_wheel_name)) - if run_smoke_tests: - host.run_cmd( - f"pip3 install {os.path.join('vision', 'dist', vision_wheel_name)}" - ) - host.run_cmd("python3 vision/test/smoke_test.py") - print("Delete vision checkout") - host.run_cmd("rm -rf vision") - - return vision_wheel_name - - -def build_torchdata( - host: RemoteHost, - *, - branch: str = "main", - use_conda: bool = True, - git_clone_flags: str = "", -) -> str: - print("Checking out TorchData repo") - git_clone_flags += " --recurse-submodules" - build_version = checkout_repo( - host, - branch=branch, - url="https://github.com/pytorch/data", - git_clone_flags=git_clone_flags, - mapping={ - "v1.13.1": ("0.5.1", ""), - "v2.0.0": ("0.6.0", "rc5"), - "v2.0.1": ("0.6.1", "rc1"), - }, - ) - print("Building TorchData wheel") - build_vars = "" - if branch == "nightly": - version = host.check_output( - ["if [ -f data/version.txt ]; then cat data/version.txt; fi"] - ).strip() - build_date = ( - host.check_output("cd data && git log --pretty=format:%s -1") - .strip() - .split()[0] - .replace("-", "") - ) - build_vars += f"BUILD_VERSION={version}.dev{build_date}" - elif build_version is not None: - build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}" - if host.using_docker(): - build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" - - host.run_cmd(f"cd data && {build_vars} python3 -m build --wheel --no-isolation") - wheel_name = host.list_dir("data/dist")[0] - embed_libgomp(host, use_conda, os.path.join("data", "dist", wheel_name)) - - print("Copying TorchData wheel") - 
host.download_wheel(os.path.join("data", "dist", wheel_name)) - - return wheel_name - - -def build_torchtext( - host: RemoteHost, - *, - branch: str = "main", - use_conda: bool = True, - git_clone_flags: str = "", -) -> str: - print("Checking out TorchText repo") - git_clone_flags += " --recurse-submodules" - build_version = checkout_repo( - host, - branch=branch, - url="https://github.com/pytorch/text", - git_clone_flags=git_clone_flags, - mapping={ - "v1.9.0": ("0.10.0", "rc1"), - "v1.10.0": ("0.11.0", "rc2"), - "v1.10.1": ("0.11.1", "rc1"), - "v1.10.2": ("0.11.2", "rc1"), - "v1.11.0": ("0.12.0", "rc1"), - "v1.12.0": ("0.13.0", "rc2"), - "v1.12.1": ("0.13.1", "rc5"), - "v1.13.0": ("0.14.0", "rc3"), - "v1.13.1": ("0.14.1", "rc1"), - "v2.0.0": ("0.15.1", "rc2"), - "v2.0.1": ("0.15.2", "rc2"), - }, - ) - print("Building TorchText wheel") - build_vars = "" - if branch == "nightly": - version = host.check_output( - ["if [ -f text/version.txt ]; then cat text/version.txt; fi"] - ).strip() - build_date = ( - host.check_output("cd text && git log --pretty=format:%s -1") - .strip() - .split()[0] - .replace("-", "") - ) - build_vars += f"BUILD_VERSION={version}.dev{build_date}" - elif build_version is not None: - build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}" - if host.using_docker(): - build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" - - host.run_cmd(f"cd text && {build_vars} python3 -m build --wheel --no-isolation") - wheel_name = host.list_dir("text/dist")[0] - embed_libgomp(host, use_conda, os.path.join("text", "dist", wheel_name)) - - print("Copying TorchText wheel") - host.download_wheel(os.path.join("text", "dist", wheel_name)) - - return wheel_name - - -def build_torchaudio( - host: RemoteHost, - *, - branch: str = "main", - use_conda: bool = True, - git_clone_flags: str = "", -) -> str: - print("Checking out TorchAudio repo") - git_clone_flags += " --recurse-submodules" - build_version = checkout_repo( - host, - branch=branch, - url="https://github.com/pytorch/audio", - git_clone_flags=git_clone_flags, - mapping={ - "v1.9.0": ("0.9.0", "rc2"), - "v1.10.0": ("0.10.0", "rc5"), - "v1.10.1": ("0.10.1", "rc1"), - "v1.10.2": ("0.10.2", "rc1"), - "v1.11.0": ("0.11.0", "rc1"), - "v1.12.0": ("0.12.0", "rc3"), - "v1.12.1": ("0.12.1", "rc5"), - "v1.13.0": ("0.13.0", "rc4"), - "v1.13.1": ("0.13.1", "rc2"), - "v2.0.0": ("2.0.1", "rc3"), - "v2.0.1": ("2.0.2", "rc2"), - }, - ) - print("Building TorchAudio wheel") - build_vars = "" - if branch == "nightly": - version = ( - host.check_output(["grep", '"version = \'"', "audio/setup.py"]) - .strip() - .split("'")[1][:-2] - ) - build_date = ( - host.check_output("cd audio && git log --pretty=format:%s -1") - .strip() - .split()[0] - .replace("-", "") - ) - build_vars += f"BUILD_VERSION={version}.dev{build_date}" - elif build_version is not None: - build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}" - if host.using_docker(): - build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" - - host.run_cmd( - f"cd audio && export FFMPEG_ROOT=$(pwd)/third_party/ffmpeg && export USE_FFMPEG=1 \ - && ./packaging/ffmpeg/build.sh \ - && {build_vars} python3 -m build --wheel --no-isolation" - ) - - wheel_name = host.list_dir("audio/dist")[0] - embed_libgomp(host, use_conda, os.path.join("audio", "dist", wheel_name)) - - print("Copying TorchAudio wheel") - host.download_wheel(os.path.join("audio", "dist", wheel_name)) - - return 
wheel_name - - -def configure_system( - host: RemoteHost, - *, - compiler: str = "gcc-8", - use_conda: bool = True, - python_version: str = "3.8", -) -> None: - if use_conda: - install_condaforge_python(host, python_version) - - print("Configuring the system") - if not host.using_docker(): - update_apt_repo(host) - host.run_cmd("sudo apt-get install -y ninja-build g++ git cmake gfortran unzip") - else: - host.run_cmd("yum install -y sudo") - host.run_cmd("conda install -y ninja scons") - - if not use_conda: - host.run_cmd( - "sudo apt-get install -y python3-dev python3-yaml python3-setuptools python3-wheel python3-pip" - ) - host.run_cmd("pip3 install dataclasses typing-extensions") - if not use_conda: - print("Installing Cython + numpy from PyPy") - host.run_cmd("sudo pip3 install Cython") - host.run_cmd("sudo pip3 install numpy") - - -def build_domains( - host: RemoteHost, - *, - branch: str = "main", - use_conda: bool = True, - git_clone_flags: str = "", -) -> tuple[str, str, str, str]: - vision_wheel_name = build_torchvision( - host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags - ) - audio_wheel_name = build_torchaudio( - host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags - ) - data_wheel_name = build_torchdata( - host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags - ) - text_wheel_name = build_torchtext( - host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags - ) - return (vision_wheel_name, audio_wheel_name, data_wheel_name, text_wheel_name) - - -def start_build( - host: RemoteHost, - *, - branch: str = "main", - compiler: str = "gcc-8", - use_conda: bool = True, - python_version: str = "3.8", - pytorch_only: bool = False, - pytorch_build_number: Optional[str] = None, - shallow_clone: bool = True, - enable_mkldnn: bool = False, -) -> tuple[str, str, str, str, str]: - git_clone_flags = " --depth 1 --shallow-submodules" if shallow_clone else "" - if host.using_docker() and not use_conda: - print("Auto-selecting conda option for docker images") - use_conda = True - if not host.using_docker(): - print("Disable mkldnn for host builds") - enable_mkldnn = False - - configure_system( - host, compiler=compiler, use_conda=use_conda, python_version=python_version - ) - - if host.using_docker(): - print("Move libgfortant.a into a standard location") - # HACK: pypa gforntran.a is compiled without PIC, which leads to the following error - # libgfortran.a(error.o)(.text._gfortrani_st_printf+0x34): unresolvable R_AARCH64_ADR_PREL_PG_HI21 relocation against symbol `__stack_chk_guard@@GLIBC_2.17' # noqa: E501, B950 - # Workaround by copying gfortran library from the host - host.run_ssh_cmd("sudo apt-get install -y gfortran-8") - host.run_cmd("mkdir -p /usr/lib/gcc/aarch64-linux-gnu/8") - host.run_ssh_cmd( - [ - "docker", - "cp", - "/usr/lib/gcc/aarch64-linux-gnu/8/libgfortran.a", - f"{host.container_id}:/opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/", - ] - ) - - print("Checking out PyTorch repo") - host.run_cmd( - f"git clone --recurse-submodules -b {branch} https://github.com/pytorch/pytorch {git_clone_flags}" - ) - - host.run_cmd("pytorch/.ci/docker/common/install_openblas.sh") - - print("Building PyTorch wheel") - build_opts = "" - if pytorch_build_number is not None: - build_opts += f" -C--build-option=--build-number={pytorch_build_number}" - # Breakpad build fails on aarch64 - build_vars = "USE_BREAKPAD=0 " - if branch == "nightly": - build_date = ( - host.check_output("cd pytorch && git log 
--pretty=format:%s -1") - .strip() - .split()[0] - .replace("-", "") - ) - version = host.check_output("cat pytorch/version.txt").strip()[:-2] - build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1" - if branch.startswith(("v1.", "v2.")): - build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1" - if host.using_docker(): - build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" - if enable_mkldnn: - host.run_cmd("pytorch/.ci/docker/common/install_acl.sh") - print("build pytorch with mkldnn+acl backend") - build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON" - build_vars += " BLAS=OpenBLAS" - build_vars += " OpenBLAS_HOME=/opt/OpenBLAS" - build_vars += " ACL_ROOT_DIR=/acl" - host.run_cmd( - f"cd $HOME/pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}" - ) - print("Repair the wheel") - pytorch_wheel_name = host.list_dir("pytorch/dist")[0] - ld_library_path = "/acl/build:$HOME/pytorch/build/lib" - host.run_cmd( - f"export LD_LIBRARY_PATH={ld_library_path} && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}" - ) - print("replace the original wheel with the repaired one") - pytorch_repaired_wheel_name = host.list_dir("wheelhouse")[0] - host.run_cmd( - f"cp $HOME/wheelhouse/{pytorch_repaired_wheel_name} $HOME/pytorch/dist/{pytorch_wheel_name}" - ) - else: - print("build pytorch without mkldnn backend") - host.run_cmd( - f"cd pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}" - ) - - print("Deleting build folder") - host.run_cmd("cd pytorch && rm -rf build") - pytorch_wheel_name = host.list_dir("pytorch/dist")[0] - embed_libgomp(host, use_conda, os.path.join("pytorch", "dist", pytorch_wheel_name)) - print("Copying the wheel") - host.download_wheel(os.path.join("pytorch", "dist", pytorch_wheel_name)) - - print("Installing PyTorch wheel") - host.run_cmd(f"pip3 install pytorch/dist/{pytorch_wheel_name}") - - if pytorch_only: - return (pytorch_wheel_name, None, None, None, None) - domain_wheels = build_domains( - host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags - ) - - return (pytorch_wheel_name, *domain_wheels) - - -embed_library_script = """ -#!/usr/bin/env python3 - -from auditwheel.patcher import Patchelf -from auditwheel.wheeltools import InWheelCtx -from auditwheel.elfutils import elf_file_filter -from auditwheel.repair import copylib -from auditwheel.lddtree import lddtree -from subprocess import check_call -import os -import shutil -import sys -from tempfile import TemporaryDirectory - - -def replace_tag(filename): - with open(filename, 'r') as f: - lines = f.read().split("\\n") - for i,line in enumerate(lines): - if not line.startswith("Tag: "): - continue - lines[i] = line.replace("-linux_", "-manylinux2014_") - print(f'Updated tag from {line} to {lines[i]}') - - with open(filename, 'w') as f: - f.write("\\n".join(lines)) - - -class AlignedPatchelf(Patchelf): - def set_soname(self, file_name: str, new_soname: str) -> None: - check_call(['patchelf', '--page-size', '65536', '--set-soname', new_soname, file_name]) - - def replace_needed(self, file_name: str, soname: str, new_soname: str) -> None: - check_call(['patchelf', '--page-size', '65536', '--replace-needed', soname, new_soname, file_name]) - - -def embed_library(whl_path, lib_soname, update_tag=False): - patcher = AlignedPatchelf() - out_dir = TemporaryDirectory() - whl_name = os.path.basename(whl_path) - tmp_whl_name = 
os.path.join(out_dir.name, whl_name) - with InWheelCtx(whl_path) as ctx: - torchlib_path = os.path.join(ctx._tmpdir.name, 'torch', 'lib') - ctx.out_wheel=tmp_whl_name - new_lib_path, new_lib_soname = None, None - for filename, elf in elf_file_filter(ctx.iter_files()): - if not filename.startswith('torch/lib'): - continue - libtree = lddtree(filename) - if lib_soname not in libtree['needed']: - continue - lib_path = libtree['libs'][lib_soname]['path'] - if lib_path is None: - print(f"Can't embed {lib_soname} as it could not be found") - break - if lib_path.startswith(torchlib_path): - continue - - if new_lib_path is None: - new_lib_soname, new_lib_path = copylib(lib_path, torchlib_path, patcher) - patcher.replace_needed(filename, lib_soname, new_lib_soname) - print(f'Replacing {lib_soname} with {new_lib_soname} for {filename}') - if update_tag: - # Add manylinux2014 tag - for filename in ctx.iter_files(): - if os.path.basename(filename) != 'WHEEL': - continue - replace_tag(filename) - shutil.move(tmp_whl_name, whl_path) - - -if __name__ == '__main__': - embed_library(sys.argv[1], 'libgomp.so.1', len(sys.argv) > 2 and sys.argv[2] == '--update-tag') -""" - - -def run_tests(host: RemoteHost, whl: str, branch="main") -> None: - print("Configuring the system") - update_apt_repo(host) - host.run_cmd("sudo apt-get install -y python3-pip git") - host.run_cmd("sudo pip3 install Cython") - host.run_cmd("sudo pip3 install numpy") - host.upload_file(whl, ".") - host.run_cmd(f"sudo pip3 install {whl}") - host.run_cmd("python3 -c 'import torch;print(torch.rand((3,3))'") - host.run_cmd(f"git clone -b {branch} https://github.com/pytorch/pytorch") - host.run_cmd("cd pytorch/test; python3 test_torch.py -v") - - -def get_instance_name(instance) -> Optional[str]: - if instance.tags is None: - return None - for tag in instance.tags: - if tag["Key"] == "Name": - return tag["Value"] - return None - - -def list_instances(instance_type: str) -> None: - print(f"All instances of type {instance_type}") - for instance in ec2_instances_of_type(instance_type): - ifaces = instance.network_interfaces - az = ifaces[0].subnet.availability_zone if len(ifaces) > 0 else None - print( - f"{instance.id} {get_instance_name(instance)} {instance.public_dns_name} {instance.state['Name']} {az}" - ) - - -def terminate_instances(instance_type: str) -> None: - print(f"Terminating all instances of type {instance_type}") - instances = list(ec2_instances_of_type(instance_type)) - for instance in instances: - print(f"Terminating {instance.id}") - instance.terminate() - print("Waiting for termination to complete") - for instance in instances: - instance.wait_until_terminated() - - -def parse_arguments(): - from argparse import ArgumentParser - - parser = ArgumentParser("Build and test AARCH64 wheels using EC2") - parser.add_argument("--key-name", type=str) - parser.add_argument("--debug", action="store_true") - parser.add_argument("--build-only", action="store_true") - parser.add_argument("--test-only", type=str) - group = parser.add_mutually_exclusive_group() - group.add_argument("--os", type=str, choices=list(os_amis.keys())) - group.add_argument("--ami", type=str) - parser.add_argument( - "--python-version", - type=str, - choices=[f"3.{d}" for d in range(6, 12)], - default=None, - ) - parser.add_argument("--alloc-instance", action="store_true") - parser.add_argument("--list-instances", action="store_true") - parser.add_argument("--pytorch-only", action="store_true") - parser.add_argument("--keep-running", action="store_true") - 
parser.add_argument("--terminate-instances", action="store_true") - parser.add_argument("--instance-type", type=str, default="t4g.2xlarge") - parser.add_argument("--ebs-size", type=int, default=50) - parser.add_argument("--branch", type=str, default="main") - parser.add_argument("--use-docker", action="store_true") - parser.add_argument( - "--compiler", - type=str, - choices=["gcc-7", "gcc-8", "gcc-9", "clang"], - default="gcc-8", - ) - parser.add_argument("--use-torch-from-pypi", action="store_true") - parser.add_argument("--pytorch-build-number", type=str, default=None) - parser.add_argument("--disable-mkldnn", action="store_true") - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_arguments() - ami = ( - args.ami - if args.ami is not None - else os_amis[args.os] - if args.os is not None - else ubuntu20_04_ami - ) - keyfile_path, key_name = compute_keyfile_path(args.key_name) - - if args.list_instances: - list_instances(args.instance_type) - sys.exit(0) - - if args.terminate_instances: - terminate_instances(args.instance_type) - sys.exit(0) - - if len(key_name) == 0: - raise RuntimeError(""" - Cannot start build without key_name, please specify - --key-name argument or AWS_KEY_NAME environment variable.""") - if len(keyfile_path) == 0 or not os.path.exists(keyfile_path): - raise RuntimeError(f""" - Cannot find keyfile with name: [{key_name}] in path: [{keyfile_path}], please - check `~/.ssh/` folder or manually set SSH_KEY_PATH environment variable.""") - - # Starting the instance - inst = start_instance( - key_name, ami=ami, instance_type=args.instance_type, ebs_size=args.ebs_size - ) - instance_name = f"{args.key_name}-{args.os}" - if args.python_version is not None: - instance_name += f"-py{args.python_version}" - inst.create_tags( - DryRun=False, - Tags=[ - { - "Key": "Name", - "Value": instance_name, - } - ], - ) - addr = inst.public_dns_name - wait_for_connection(addr, 22) - host = RemoteHost(addr, keyfile_path) - host.ami = ami - if args.use_docker: - update_apt_repo(host) - host.start_docker() - - if args.test_only: - run_tests(host, args.test_only) - sys.exit(0) - - if args.alloc_instance: - if args.python_version is None: - sys.exit(0) - install_condaforge_python(host, args.python_version) - sys.exit(0) - - python_version = args.python_version if args.python_version is not None else "3.10" - - if args.use_torch_from_pypi: - configure_system(host, compiler=args.compiler, python_version=python_version) - print("Installing PyTorch wheel") - host.run_cmd("pip3 install torch") - build_domains( - host, branch=args.branch, git_clone_flags=" --depth 1 --shallow-submodules" - ) - else: - start_build( - host, - branch=args.branch, - compiler=args.compiler, - python_version=python_version, - pytorch_only=args.pytorch_only, - pytorch_build_number=args.pytorch_build_number, - enable_mkldnn=not args.disable_mkldnn, - ) - if not args.keep_running: - print(f"Waiting for instance {inst.id} to terminate") - inst.terminate() - inst.wait_until_terminated() diff --git a/.ci/aarch64_linux/embed_library.py b/.ci/aarch64_linux/embed_library.py deleted file mode 100644 index 2834a4632989b..0000000000000 --- a/.ci/aarch64_linux/embed_library.py +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env python3 - -import os -import shutil -import sys -from subprocess import check_call -from tempfile import TemporaryDirectory - -from auditwheel.elfutils import elf_file_filter -from auditwheel.lddtree import lddtree -from auditwheel.patcher import Patchelf -from auditwheel.repair import 
copylib -from auditwheel.wheeltools import InWheelCtx - - -def replace_tag(filename): - with open(filename) as f: - lines = f.read().split("\\n") - for i, line in enumerate(lines): - if not line.startswith("Tag: "): - continue - lines[i] = line.replace("-linux_", "-manylinux2014_") - print(f"Updated tag from {line} to {lines[i]}") - - with open(filename, "w") as f: - f.write("\\n".join(lines)) - - -class AlignedPatchelf(Patchelf): - def set_soname(self, file_name: str, new_soname: str) -> None: - check_call( - ["patchelf", "--page-size", "65536", "--set-soname", new_soname, file_name] - ) - - def replace_needed(self, file_name: str, soname: str, new_soname: str) -> None: - check_call( - [ - "patchelf", - "--page-size", - "65536", - "--replace-needed", - soname, - new_soname, - file_name, - ] - ) - - -def embed_library(whl_path, lib_soname, update_tag=False): - patcher = AlignedPatchelf() - out_dir = TemporaryDirectory() - whl_name = os.path.basename(whl_path) - tmp_whl_name = os.path.join(out_dir.name, whl_name) - with InWheelCtx(whl_path) as ctx: - torchlib_path = os.path.join(ctx._tmpdir.name, "torch", "lib") - ctx.out_wheel = tmp_whl_name - new_lib_path, new_lib_soname = None, None - for filename, _ in elf_file_filter(ctx.iter_files()): - if not filename.startswith("torch/lib"): - continue - libtree = lddtree(filename) - if lib_soname not in libtree["needed"]: - continue - lib_path = libtree["libs"][lib_soname]["path"] - if lib_path is None: - print(f"Can't embed {lib_soname} as it could not be found") - break - if lib_path.startswith(torchlib_path): - continue - - if new_lib_path is None: - new_lib_soname, new_lib_path = copylib(lib_path, torchlib_path, patcher) - patcher.replace_needed(filename, lib_soname, new_lib_soname) - print(f"Replacing {lib_soname} with {new_lib_soname} for {filename}") - if update_tag: - # Add manylinux2014 tag - for filename in ctx.iter_files(): - if os.path.basename(filename) != "WHEEL": - continue - replace_tag(filename) - shutil.move(tmp_whl_name, whl_path) - - -if __name__ == "__main__": - embed_library( - sys.argv[1], "libgomp.so.1", len(sys.argv) > 2 and sys.argv[2] == "--update-tag" - ) diff --git a/.ci/docker/almalinux/Dockerfile b/.ci/docker/almalinux/Dockerfile index ce7803cf9acd2..3bc3fd8badc6d 100644 --- a/.ci/docker/almalinux/Dockerfile +++ b/.ci/docker/almalinux/Dockerfile @@ -7,13 +7,13 @@ ENV LC_ALL en_US.UTF-8 ENV LANG en_US.UTF-8 ENV LANGUAGE en_US.UTF-8 -ARG DEVTOOLSET_VERSION=11 +ARG DEVTOOLSET_VERSION=13 RUN yum -y update RUN yum -y install epel-release # install glibc-langpack-en make sure en_US.UTF-8 locale is available RUN yum -y install glibc-langpack-en -RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-toolchain +RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-gcc gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran gcc-toolset-${DEVTOOLSET_VERSION}-gdb # Just add everything as a safe.directory for git since these will be used in multiple places with git RUN git config --global --add safe.directory '*' ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH @@ -41,6 +41,7 @@ RUN bash ./install_conda.sh && rm install_conda.sh # Install CUDA FROM base as cuda ARG CUDA_VERSION=12.6 +ARG DEVTOOLSET_VERSION=13 RUN rm -rf /usr/local/cuda-* ADD 
./common/install_cuda.sh install_cuda.sh COPY ./common/install_nccl.sh install_nccl.sh @@ -50,7 +51,8 @@ ENV CUDA_HOME=/usr/local/cuda-${CUDA_VERSION} # Preserve CUDA_VERSION for the builds ENV CUDA_VERSION=${CUDA_VERSION} # Make things in our path by default -ENV PATH=/usr/local/cuda-${CUDA_VERSION}/bin:$PATH +ENV PATH=/usr/local/cuda-${CUDA_VERSION}/bin:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH + FROM cuda as cuda12.6 RUN bash ./install_cuda.sh 12.6 @@ -68,8 +70,22 @@ FROM cuda as cuda13.0 RUN bash ./install_cuda.sh 13.0 ENV DESIRED_CUDA=13.0 -FROM ${ROCM_IMAGE} as rocm +FROM ${ROCM_IMAGE} as rocm_base +ARG DEVTOOLSET_VERSION=13 +ENV LC_ALL en_US.UTF-8 +ENV LANG en_US.UTF-8 +ENV LANGUAGE en_US.UTF-8 +# Install devtoolset on ROCm base image +RUN yum -y update && \ + yum -y install epel-release && \ + yum -y install glibc-langpack-en && \ + yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-gcc gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran gcc-toolset-${DEVTOOLSET_VERSION}-gdb +RUN git config --global --add safe.directory '*' +ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH + +FROM rocm_base as rocm ARG PYTORCH_ROCM_ARCH +ARG DEVTOOLSET_VERSION=13 ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} ADD ./common/install_mkl.sh install_mkl.sh RUN bash ./install_mkl.sh && rm install_mkl.sh @@ -88,6 +104,7 @@ COPY --from=cuda13.0 /usr/local/cuda-13.0 /usr/local/cuda-13.0 # Final step FROM ${BASE_TARGET} as final +ARG DEVTOOLSET_VERSION=13 COPY --from=openssl /opt/openssl /opt/openssl COPY --from=patchelf /patchelf /usr/local/bin/patchelf COPY --from=conda /opt/conda /opt/conda diff --git a/.ci/docker/almalinux/build.sh b/.ci/docker/almalinux/build.sh index ad234ce1ffb93..468f9b06418f7 100755 --- a/.ci/docker/almalinux/build.sh +++ b/.ci/docker/almalinux/build.sh @@ -36,11 +36,7 @@ case ${DOCKER_TAG_PREFIX} in ;; rocm*) BASE_TARGET=rocm - PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" - # add gfx950, gfx115x conditionally starting in ROCm 7.0 - if [[ "$ROCM_VERSION" == *"7.0"* ]]; then - PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151" - fi + PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151" EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}" ;; *) @@ -63,7 +59,7 @@ docker build \ --target final \ --progress plain \ --build-arg "BASE_TARGET=${BASE_TARGET}" \ - --build-arg "DEVTOOLSET_VERSION=11" \ + --build-arg "DEVTOOLSET_VERSION=13" \ ${EXTRA_BUILD_ARGS} \ -t ${tmp_tag} \ $@ \ diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index d0500b89780ce..203ab597a75bc 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -168,6 +168,18 @@ case "$tag" in VISION=yes TRITON=yes ;; + pytorch-linux-jammy-py3.11-clang12) + ANACONDA_PYTHON_VERSION=3.11 + CLANG_VERSION=12 + VISION=no + TRITON=no + ;; + pytorch-linux-jammy-py3.12-clang12) + ANACONDA_PYTHON_VERSION=3.12 + CLANG_VERSION=12 + VISION=no + TRITON=no + ;; pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-jammy-rocm-n-py3-benchmarks | pytorch-linux-noble-rocm-n-py3) if [[ $tag =~ "jammy" ]]; then ANACONDA_PYTHON_VERSION=3.10 @@ -195,9 +207,9 @@ case "$tag" in NINJA_VERSION=1.9.0 TRITON=yes ;; - pytorch-linux-jammy-xpu-n-py3 | 
pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks) + pytorch-linux-noble-xpu-n-py3 | pytorch-linux-noble-xpu-n-py3-inductor-benchmarks) ANACONDA_PYTHON_VERSION=3.10 - GCC_VERSION=11 + GCC_VERSION=13 VISION=yes XPU_VERSION=2025.2 NINJA_VERSION=1.9.0 @@ -248,6 +260,12 @@ case "$tag" in HALIDE=yes TRITON=yes ;; + pytorch-linux-jammy-cuda12.8-py3.12-pallas) + CUDA_VERSION=12.8.1 + ANACONDA_PYTHON_VERSION=3.12 + GCC_VERSION=11 + PALLAS=yes + ;; pytorch-linux-jammy-py3.12-triton-cpu) CUDA_VERSION=12.6 ANACONDA_PYTHON_VERSION=3.12 @@ -261,9 +279,9 @@ case "$tag" in PYTHON_VERSION=3.10 CUDA_VERSION=12.8.1 ;; - pytorch-linux-jammy-aarch64-py3.10-gcc11) + pytorch-linux-jammy-aarch64-py3.10-gcc13) ANACONDA_PYTHON_VERSION=3.10 - GCC_VERSION=11 + GCC_VERSION=13 ACL=yes VISION=yes OPENBLAS=yes @@ -271,9 +289,19 @@ case "$tag" in # from pytorch/llvm:9.0.1 is x86 specific SKIP_LLVM_SRC_BUILD_INSTALL=yes ;; - pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks) + pytorch-linux-jammy-aarch64-py3.10-clang21) ANACONDA_PYTHON_VERSION=3.10 - GCC_VERSION=11 + CLANG_VERSION=21 + ACL=yes + VISION=yes + OPENBLAS=yes + # snadampal: skipping llvm src build install because the current version + # from pytorch/llvm:9.0.1 is x86 specific + SKIP_LLVM_SRC_BUILD_INSTALL=yes + ;; + pytorch-linux-jammy-aarch64-py3.10-gcc13-inductor-benchmarks) + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=13 ACL=yes VISION=yes OPENBLAS=yes @@ -359,6 +387,7 @@ docker build \ --build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \ --build-arg "EXECUTORCH=${EXECUTORCH}" \ --build-arg "HALIDE=${HALIDE}" \ + --build-arg "PALLAS=${PALLAS}" \ --build-arg "XPU_VERSION=${XPU_VERSION}" \ --build-arg "UNINSTALL_DILL=${UNINSTALL_DILL}" \ --build-arg "ACL=${ACL:-}" \ diff --git a/.ci/docker/ci_commit_pins/jax.txt b/.ci/docker/ci_commit_pins/jax.txt new file mode 100644 index 0000000000000..a3df0a6959e15 --- /dev/null +++ b/.ci/docker/ci_commit_pins/jax.txt @@ -0,0 +1 @@ +0.8.0 diff --git a/.ci/docker/common/install_clang.sh b/.ci/docker/common/install_clang.sh index 1cb216edf1b38..93daeee919b3d 100755 --- a/.ci/docker/common/install_clang.sh +++ b/.ci/docker/common/install_clang.sh @@ -8,8 +8,8 @@ if [ -n "$CLANG_VERSION" ]; then # work around ubuntu apt-get conflicts sudo apt-get -y -f install wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add - - if [[ $CLANG_VERSION == 18 ]]; then - apt-add-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-18 main" + if [[ $CLANG_VERSION -ge 18 ]]; then + apt-add-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-${CLANG_VERSION} main" fi fi diff --git a/.ci/docker/common/install_gcc.sh b/.ci/docker/common/install_gcc.sh index 3b96bf6e0ed2f..df1c059bc3869 100644 --- a/.ci/docker/common/install_gcc.sh +++ b/.ci/docker/common/install_gcc.sh @@ -7,11 +7,11 @@ if [ -n "$GCC_VERSION" ]; then # Need the official toolchain repo to get alternate packages add-apt-repository ppa:ubuntu-toolchain-r/test apt-get update - apt-get install -y g++-$GCC_VERSION + apt-get install -y g++-$GCC_VERSION gfortran-$GCC_VERSION update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-"$GCC_VERSION" 50 update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-"$GCC_VERSION" 50 update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-"$GCC_VERSION" 50 - + update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-"$GCC_VERSION" 50 # Cleanup package manager apt-get autoclean && apt-get clean diff --git a/.ci/docker/common/install_jax.sh 
b/.ci/docker/common/install_jax.sh new file mode 100755 index 0000000000000..184aedf0f94fe --- /dev/null +++ b/.ci/docker/common/install_jax.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +set -ex + +source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" + +# Get the pinned JAX version (same for all CUDA versions) +JAX_VERSION=$(get_pinned_commit /ci_commit_pins/jax) + +function install_jax_12() { + echo "Installing JAX ${JAX_VERSION} with CUDA 12 support" + pip_install "jax[cuda12]==${JAX_VERSION}" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html + + # Verify installation + python -c "import jax" # check for errors + echo "JAX ${JAX_VERSION} installation completed successfully for CUDA 12" +} + +function install_jax_13() { + echo "Installing JAX ${JAX_VERSION} with CUDA 13 support" + pip_install "jax[cuda13]==${JAX_VERSION}" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html + + # Verify installation + python -c "import jax" # check for errors + echo "JAX ${JAX_VERSION} installation completed successfully for CUDA 13" +} + +# idiomatic parameter and option handling in sh +while test $# -gt 0 +do + case "$1" in + 12.4|12.6|12.6.*|12.8|12.8.*|12.9|12.9.*) install_jax_12; + ;; + 13.0|13.0.*) install_jax_13; + ;; + *) echo "bad argument $1"; exit 1 + ;; + esac + shift +done diff --git a/.ci/docker/common/install_libgomp.sh b/.ci/docker/common/install_libgomp.sh new file mode 100644 index 0000000000000..308915ec4f618 --- /dev/null +++ b/.ci/docker/common/install_libgomp.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# Script used only in CD pipeline + +set -ex + +# install dependencies +dnf -y install gmp-devel libmpc-devel texinfo flex bison + +cd /usr/local/src +# fetch source for gcc 13 +git clone --depth 1 --single-branch -b releases/gcc-13.3.0 https://github.com/gcc-mirror/gcc.git gcc-13.3.0 + +mkdir -p gcc-13.3.0/build-gomp +cd gcc-13.3.0/build-gomp + +# configure gcc build +# I got these flags by: +# 1. downloading the source rpm for gcc-11 on AlmaLinux 8 container +# dnf install -y dnf-plugins-core rpmdevtools +# dnf download --source libgomp +# 2. extracting the gcc.spec from the source. +# rpmdev-extract gcc-xx.src.rpm +# 3. extracting optflags and ld_flags from gcc.spec: +# rpm --eval '%{optflags}' +# rpm --eval '%{build_ldflags}' +# +# I had to remove the following flags because they didn't compile for this version of libgomp: +# -Werror=format-security +# -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 +# -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 +# +# I added -march=armv8-a -mtune=generic to make them explicit. I don't think they're strictly needed. 
+ +OPT_FLAGS='-O2 -march=armv8-a -mtune=generic'\ +' -fexceptions -g -grecord-gcc-switches -pipe -Wall'\ +' -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS'\ +' -fstack-protector-strong -fasynchronous-unwind-tables'\ +' -fstack-clash-protection' + +LDFLAGS='-Wl,-z,relro -Wl,--as-needed -Wl,-z,now' + +CFLAGS="$OPT_FLAGS" \ +CXXFLAGS="$OPT_FLAGS" \ +LDFLAGS="$LDFLAGS" \ +../configure \ + --prefix=/usr \ + --libdir=/usr/lib64 \ + --enable-languages=c,c++ \ + --disable-multilib \ + --disable-bootstrap \ + --enable-libgomp + +# only build libgomp +make -j$(nproc) all-target-libgomp + +make install-target-libgomp \ No newline at end of file diff --git a/.ci/docker/common/install_openblas.sh b/.ci/docker/common/install_openblas.sh index 2f386c6bd523a..5a28068781245 100755 --- a/.ci/docker/common/install_openblas.sh +++ b/.ci/docker/common/install_openblas.sh @@ -10,6 +10,7 @@ git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION}" - OPENBLAS_CHECKOUT_DIR="OpenBLAS" OPENBLAS_BUILD_FLAGS=" +CC=gcc NUM_THREADS=128 USE_OPENMP=1 NO_SHARED=0 diff --git a/.ci/docker/common/install_xpu.sh b/.ci/docker/common/install_xpu.sh index 0b150872f93ce..22b7af890c1f6 100644 --- a/.ci/docker/common/install_xpu.sh +++ b/.ci/docker/common/install_xpu.sh @@ -9,7 +9,7 @@ set -xe function install_ubuntu() { . /etc/os-release - if [[ ! " jammy " =~ " ${VERSION_CODENAME} " ]]; then + if [[ ! " jammy noble " =~ " ${VERSION_CODENAME} " ]]; then echo "Ubuntu version ${VERSION_CODENAME} not supported" exit fi @@ -35,25 +35,24 @@ function install_ubuntu() { # The xpu-smi packages apt-get install -y flex bison xpu-smi - if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then - # Compute and Media Runtimes + # Compute and Media Runtimes + if [[ " ${VERSION_CODENAME} " =~ " noble " ]]; then apt-get install -y \ - intel-opencl-icd intel-level-zero-gpu level-zero \ - intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \ - libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \ + intel-opencl-icd libze-intel-gpu1 libze1 \ + intel-media-va-driver-non-free libmfx-gen1 libvpl2 \ + libegl-mesa0 libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \ libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \ - mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo - # Development Packages - apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev - else # rolling driver + mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc + else # jammy apt-get install -y \ intel-opencl-icd libze-intel-gpu1 libze1 \ intel-media-va-driver-non-free libmfx-gen1 libvpl2 \ libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \ libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \ mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc - apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev fi + # Development Packages + apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev # Install Intel Support Packages apt-get install -y ${XPU_PACKAGES} @@ -66,7 +65,7 @@ function install_ubuntu() { function install_rhel() { . /etc/os-release if [[ "${ID}" == "rhel" ]]; then - if [[ ! " 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then + if [[ ! 
" 8.8 8.10 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then echo "RHEL version ${VERSION_ID} not supported" exit fi @@ -147,7 +146,7 @@ function install_sles() { XPU_DRIVER_VERSION="" if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then # Use GPU driver LTS releases - XPU_DRIVER_VERSION="/lts/2350" + XPU_DRIVER_VERSION="/lts/2523" fi # Default use Intel® oneAPI Deep Learning Essentials 2025.1 diff --git a/.ci/docker/libtorch/build.sh b/.ci/docker/libtorch/build.sh index c40896cb5499f..76d3e01e1c38f 100755 --- a/.ci/docker/libtorch/build.sh +++ b/.ci/docker/libtorch/build.sh @@ -49,11 +49,7 @@ case ${DOCKER_TAG_PREFIX} in fi BASE_TARGET=rocm GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete - PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" - # add gfx950, gfx115x conditionally starting in ROCm 7.0 - if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then - PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151" - fi + PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151" DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}" ;; *) diff --git a/.ci/docker/manywheel/Dockerfile_2_28_aarch64 b/.ci/docker/manywheel/Dockerfile_2_28_aarch64 index 768db09929361..78ee09d128cb0 100644 --- a/.ci/docker/manywheel/Dockerfile_2_28_aarch64 +++ b/.ci/docker/manywheel/Dockerfile_2_28_aarch64 @@ -50,6 +50,10 @@ RUN rm install_ninja.sh ENV PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/bin:$PATH ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH +# Build a newer version of libgomp than that supported in in Almalinux 8. 
+COPY ./common/install_libgomp.sh install_libgomp.sh +RUN bash ./install_libgomp.sh && rm install_libgomp.sh + # git236+ would refuse to run git commands in repos owned by other users # Which causes version check to fail, as pytorch repo is bind-mounted into the image # Override this behaviour by treating every folder as safe diff --git a/.ci/docker/manywheel/build.sh b/.ci/docker/manywheel/build.sh index ac385ce4b29fd..8f9059dc0cc12 100755 --- a/.ci/docker/manywheel/build.sh +++ b/.ci/docker/manywheel/build.sh @@ -87,11 +87,7 @@ case ${image} in MANY_LINUX_VERSION="2_28" DEVTOOLSET_VERSION="11" GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete - PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" - # add gfx950, gfx115x conditionally starting in ROCm 7.0 - if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then - PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151" - fi + PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151" DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}" ;; manylinux2_28-builder:xpu) diff --git a/.ci/docker/requirements-docs.txt b/.ci/docker/requirements-docs.txt index 6e623b4c56949..de71919012e13 100644 --- a/.ci/docker/requirements-docs.txt +++ b/.ci/docker/requirements-docs.txt @@ -1,15 +1,11 @@ -sphinx==5.3.0 +sphinx==7.2.6 #Description: This is used to generate PyTorch docs -#Pinned versions: 5.3.0 +#Pinned versions: 7.2.6 -standard-imghdr==3.13.0; python_version >= "3.13" -#Description: This is needed by Sphinx, so it needs to be added here. -# The reasons are as follows: -# 1) This module has been removed from the Python standard library since Python 3.13(https://peps.python.org/pep-0594/#imghdr); -# 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13. -# Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency. +pytorch_sphinx_theme2==0.2.0 +#Description: This is needed to generate PyTorch docs +#Pinned versions: 0.2.0 --e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering # but it doesn't seem to work and hangs around idly. The initial thought that it is probably # something related to Docker setup. We can investigate this later. @@ -36,17 +32,17 @@ tensorboard==2.18.0 ; python_version >= "3.13" #Description: This is used to generate PyTorch docs #Pinned versions: 2.13.0 -breathe==4.34.0 +breathe==4.36.0 #Description: This is used to generate PyTorch C++ docs -#Pinned versions: 4.34.0 +#Pinned versions: 4.36.0 -exhale==0.2.3 +exhale==0.3.7 #Description: This is used to generate PyTorch C++ docs -#Pinned versions: 0.2.3 +#Pinned versions: 0.3.7 -docutils==0.16 +docutils==0.20 #Description: This is used to generate PyTorch C++ docs -#Pinned versions: 0.16 +#Pinned versions: 0.20 bs4==0.0.1 #Description: This is used to generate PyTorch C++ docs @@ -56,13 +52,13 @@ IPython==8.12.0 #Description: This is used to generate PyTorch functorch docs #Pinned versions: 8.12.0 -myst-nb==0.17.2 +myst-nb==1.3.0 #Description: This is used to generate PyTorch functorch and torch.compile docs. 
-#Pinned versions: 0.17.2 +#Pinned versions: 1.3.0 # The following are required to build torch.distributed.elastic.rendezvous.etcd* docs python-etcd==0.4.5 sphinx-copybutton==0.5.0 -sphinx-design==0.4.0 +sphinx-design==0.6.1 sphinxcontrib-mermaid==1.0.0 -myst-parser==0.18.1 +myst-parser==4.0.1 diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 84a74114c381e..2081dcbdffd17 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -143,6 +143,15 @@ COPY ci_commit_pins/halide.txt halide.txt RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi RUN rm install_halide.sh common_utils.sh halide.txt +ARG PALLAS +ARG CUDA_VERSION +# Install JAX with CUDA support (for Pallas) +COPY ./common/install_jax.sh install_jax.sh +COPY ./common/common_utils.sh common_utils.sh +COPY ./ci_commit_pins/jax.txt /ci_commit_pins/jax.txt +RUN if [ -n "${PALLAS}" ]; then bash ./install_jax.sh ${CUDA_VERSION}; fi +RUN rm -f install_jax.sh common_utils.sh /ci_commit_pins/jax.txt + ARG ONNX # Install ONNX dependencies COPY ./common/install_onnx.sh ./common/common_utils.sh ./ diff --git a/.ci/lumen_cli/cli/lib/common/cli_helper.py b/.ci/lumen_cli/cli/lib/common/cli_helper.py index 927ca09fe7230..4086eb7d46e81 100644 --- a/.ci/lumen_cli/cli/lib/common/cli_helper.py +++ b/.ci/lumen_cli/cli/lib/common/cli_helper.py @@ -8,9 +8,11 @@ try: - from typing import Any, Callable, Required, TypedDict # Python 3.11+ + from collections.abc import Callable # Python 3.11+ + from typing import Any, Required, TypedDict except ImportError: - from typing import Any, Callable, TypedDict + from collections.abc import Callable + from typing import Any, TypedDict from typing_extensions import Required # Fallback for Python <3.11 diff --git a/.ci/magma-rocm/README.md b/.ci/magma-rocm/README.md index cfc3cd3ab1632..3fe1e5d976ccd 100644 --- a/.ci/magma-rocm/README.md +++ b/.ci/magma-rocm/README.md @@ -30,7 +30,6 @@ into a tarball, with the following structure: More specifically, `build_magma.sh` copies over the relevant files from the `package_files` directory depending on the ROCm version. Outputted binaries should be in the `output` folder. - ## Pushing Packages can be uploaded to an S3 bucket using: diff --git a/.ci/magma-rocm/build_magma.sh b/.ci/magma-rocm/build_magma.sh index 7d95fed873dc0..c7c7780227ea5 100755 --- a/.ci/magma-rocm/build_magma.sh +++ b/.ci/magma-rocm/build_magma.sh @@ -6,8 +6,8 @@ set -eou pipefail # The script expects DESIRED_CUDA and PACKAGE_NAME to be set ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" -# post merge of https://github.com/icl-utk-edu/magma/pull/65 -MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f +# https://github.com/icl-utk-edu/magma/pull/65 +MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec # Folders for the build PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata @@ -20,7 +20,7 @@ mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RE # Fetch magma sources and verify checksum pushd ${PACKAGE_DIR} -git clone https://github.com/icl-utk-edu/magma +git clone https://github.com/jeffdaily/magma pushd magma git checkout ${MAGMA_VERSION} popd diff --git a/.ci/manywheel/build.sh b/.ci/manywheel/build.sh index 6b2a60bc5ca28..ed8cda785d26a 100755 --- a/.ci/manywheel/build.sh +++ b/.ci/manywheel/build.sh @@ -4,14 +4,17 @@ set -ex SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +# Source the common build script for architecture-specific configurations (MKLDNN, ACL, etc.) 
+source "${SCRIPTPATH}/../pytorch/build.sh" || true + case "${GPU_ARCH_TYPE:-BLANK}" in - cuda) + cuda | cuda-aarch64) bash "${SCRIPTPATH}/build_cuda.sh" ;; rocm) bash "${SCRIPTPATH}/build_rocm.sh" ;; - cpu | cpu-cxx11-abi | cpu-s390x) + cpu | cpu-cxx11-abi | cpu-aarch64 | cpu-s390x) bash "${SCRIPTPATH}/build_cpu.sh" ;; xpu) diff --git a/.ci/manywheel/build_common.sh b/.ci/manywheel/build_common.sh index b84268fd12896..d0220575aadc5 100644 --- a/.ci/manywheel/build_common.sh +++ b/.ci/manywheel/build_common.sh @@ -18,12 +18,31 @@ retry () { $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) } +# Detect architecture first +ARCH=$(uname -m) +echo "Detected architecture: $ARCH" + PLATFORM="" # TODO move this into the Docker images OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release) if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then retry yum install -q -y zip openssl - PLATFORM="manylinux_2_28_x86_64" + # Set platform based on architecture + case $ARCH in + x86_64) + PLATFORM="manylinux_2_28_x86_64" + ;; + aarch64) + PLATFORM="manylinux_2_28_aarch64" + ;; + s390x) + PLATFORM="manylinux_2_28_s390x" + ;; + *) + echo "Unsupported architecture: $ARCH" + exit 1 + ;; + esac elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then retry dnf install -q -y zip openssl elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then @@ -38,6 +57,8 @@ else exit 1 fi +echo "Platform set to: $PLATFORM" + # We use the package name to test the package by passing this to 'pip install' # This is the env variable that setup.py uses to name the package. Note that # pip 'normalizes' the name first by changing all - to _ @@ -299,8 +320,8 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w # ROCm workaround for roctracer dlopens if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then patchedpath=$(fname_without_so_number $destpath) - # Keep the so number for XPU dependencies and libgomp.so.1 to avoid twice load - elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" ]]; then + # Keep the so number for XPU dependencies, libgomp.so.1, ACL libraries, and NVPL libraries to avoid twice load + elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" || "$filename" == libarm_compute* || "$filename" == libnvpl* || "$filename" == "libgfortran.so.5" ]]; then patchedpath=$destpath else patchedpath=$(fname_with_sha256 $destpath) @@ -346,9 +367,22 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w done # create Manylinux 2_28 tag this needs to happen before regenerate the RECORD - if [[ $PLATFORM == "manylinux_2_28_x86_64" && $GPU_ARCH_TYPE != "cpu-s390x" && $GPU_ARCH_TYPE != "xpu" ]]; then + # Support all architectures (x86_64, aarch64, s390x) + if [[ "$IS_MANYLINUX2_28" == "1" && $GPU_ARCH_TYPE != "xpu" ]]; then wheel_file=$(echo $(basename $pkg) | sed -e 's/-cp.*$/.dist-info\/WHEEL/g') - sed -i -e s#linux_x86_64#"${PLATFORM}"# $wheel_file; + echo "Updating wheel tag for $ARCH architecture" + # Replace linux_* with manylinux_2_28_* based on architecture + case $ARCH in + x86_64) + sed -i -e 's#linux_x86_64#manylinux_2_28_x86_64#g' $wheel_file + ;; + aarch64) + sed -i -e 's#linux_aarch64#manylinux_2_28_aarch64#g' $wheel_file + ;; + s390x) + sed -i -e 's#linux_s390x#manylinux_2_28_s390x#g' $wheel_file + ;; + esac fi # regenerate the RECORD file with new hashes diff --git a/.ci/manywheel/build_cpu.sh b/.ci/manywheel/build_cpu.sh index 9d982bd30e25a..c3ddba33cd946 100755 --- a/.ci/manywheel/build_cpu.sh +++ b/.ci/manywheel/build_cpu.sh @@ -15,6 
+15,10 @@ if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then EXTRA_CAFFE2_CMAKE_FLAGS=() fi +# Detect architecture +ARCH=$(uname -m) +echo "Building CPU wheel for architecture: $ARCH" + WHEELHOUSE_DIR="wheelhousecpu" LIBTORCH_HOUSE_DIR="libtorch_housecpu" if [[ -z "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then @@ -34,8 +38,10 @@ elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then LIBGOMP_PATH="/usr/lib64/libgomp.so.1" elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then - if [[ "$(uname -m)" == "s390x" ]]; then + if [[ "$ARCH" == "s390x" ]]; then LIBGOMP_PATH="/usr/lib/s390x-linux-gnu/libgomp.so.1" + elif [[ "$ARCH" == "aarch64" ]]; then + LIBGOMP_PATH="/usr/lib/aarch64-linux-gnu/libgomp.so.1" else LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1" fi @@ -49,6 +55,32 @@ DEPS_SONAME=( "libgomp.so.1" ) +# Add ARM-specific library dependencies for CPU builds +if [[ "$ARCH" == "aarch64" ]]; then + echo "Adding ARM-specific CPU library dependencies" + + # ARM Compute Library (if available) + if [[ -d "/acl/build" ]]; then + echo "Adding ARM Compute Library for CPU" + DEPS_LIST+=( + "/acl/build/libarm_compute.so" + "/acl/build/libarm_compute_graph.so" + ) + DEPS_SONAME+=( + "libarm_compute.so" + "libarm_compute_graph.so" + ) + fi + + # ARM system libraries + DEPS_LIST+=( + "/usr/lib64/libgfortran.so.5" + ) + DEPS_SONAME+=( + "libgfortran.so.5" + ) +fi + rm -rf /usr/local/cuda* SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" diff --git a/.ci/manywheel/build_cuda.sh b/.ci/manywheel/build_cuda.sh index 2a822295e0361..260772ffe8023 100644 --- a/.ci/manywheel/build_cuda.sh +++ b/.ci/manywheel/build_cuda.sh @@ -29,6 +29,10 @@ if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then EXTRA_CAFFE2_CMAKE_FLAGS=() fi +# Detect architecture +ARCH=$(uname -m) +echo "Building for architecture: $ARCH" + # Determine CUDA version and architectures to build for # # NOTE: We should first check `DESIRED_CUDA` when determining `CUDA_VERSION`, @@ -53,34 +57,60 @@ fi cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.') EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON") +# Function to remove architectures from a list +remove_archs() { + local result="$1" + shift + for arch in "$@"; do + result="${result//${arch};/}" + done + echo "$result" +} + +# Function to filter CUDA architectures for aarch64 +# aarch64 ARM GPUs only support certain compute capabilities +# Keep: 8.0 (A100), 9.0+ (Hopper, Grace Hopper, newer) +# Remove: < 8.0 (no ARM GPUs), 8.6 (x86_64 RTX 3090/A6000 only) +filter_aarch64_archs() { + local arch_list="$1" + # Explicitly remove architectures not needed on aarch64 + arch_list=$(remove_archs "$arch_list" "5.0" "6.0" "7.0" "7.5" "8.6") + echo "$arch_list" +} + +# Base: Common architectures across all modern CUDA versions +TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0" + case ${CUDA_VERSION} in - #removing sm_50-sm_60 as these architectures are deprecated in CUDA 12.8/9 and will be removed in future releases - #however we would like to keep sm_70 architecture see: https://github.com/pytorch/pytorch/issues/157517 - 12.8) - TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0" - ;; - 12.9) - TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0+PTX" - # WAR to resolve the ld error in libtorch build with CUDA 12.9 + 12.6) TORCH_CUDA_ARCH_LIST="5.0;6.0;${TORCH_CUDA_ARCH_LIST}" ;; # Only 12.6 includes Legacy Maxwell/Pascal that will be removed in future releases + 12.8) TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};10.0;12.0" ;; # +Hopper/Blackwell support + 12.9) 
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};10.0;12.0+PTX" # +Hopper/Blackwell support + PTX for forward compatibility if [[ "$PACKAGE_TYPE" == "libtorch" ]]; then - TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0;10.0;12.0+PTX" + TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST//7.0;/}" # Remove 7.0 to resolve the ld error + TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST//8.6;/}" # Remove 8.6 for libtorch fi ;; 13.0) - TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX" - ;; - 12.6) - TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0" - ;; - *) - echo "unknown cuda version $CUDA_VERSION" - exit 1 + TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;$([[ "$ARCH" == "aarch64" ]] && echo "11.0;" || echo "")12.0+PTX" + export TORCH_NVCC_FLAGS="-compress-mode=size" + export BUILD_BUNDLE_PTXAS=1 ;; + *) echo "unknown cuda version $CUDA_VERSION"; exit 1 ;; esac +# Filter for aarch64: Remove < 8.0 and 8.6 +[[ "$ARCH" == "aarch64" ]] && TORCH_CUDA_ARCH_LIST=$(filter_aarch64_archs "$TORCH_CUDA_ARCH_LIST") + +echo "TORCH_CUDA_ARCH_LIST set to: $TORCH_CUDA_ARCH_LIST" export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} echo "${TORCH_CUDA_ARCH_LIST}" +# Disable MAGMA for aarch64 as pre-built libraries are x86-64 only +if [[ "$ARCH" == "aarch64" ]]; then + echo "Disabling MAGMA for aarch64 architecture" + export USE_MAGMA=0 +fi + # Package directories WHEELHOUSE_DIR="wheelhouse$cuda_version_nodot" LIBTORCH_HOUSE_DIR="libtorch_house$cuda_version_nodot" @@ -244,6 +274,51 @@ else exit 1 fi +# Add ARM-specific library dependencies +if [[ "$ARCH" == "aarch64" ]]; then + echo "Adding ARM-specific library dependencies" + + # ARM Compute Library (if available) + if [[ -d "/acl/build" ]]; then + echo "Adding ARM Compute Library" + DEPS_LIST+=( + "/acl/build/libarm_compute.so" + "/acl/build/libarm_compute_graph.so" + ) + DEPS_SONAME+=( + "libarm_compute.so" + "libarm_compute_graph.so" + ) + fi + + # ARM system libraries + DEPS_LIST+=( + "/lib64/libgomp.so.1" + "/usr/lib64/libgfortran.so.5" + ) + DEPS_SONAME+=( + "libgomp.so.1" + "libgfortran.so.5" + ) + + # NVPL libraries (ARM optimized BLAS/LAPACK) + if [[ -d "/usr/local/lib" && -f "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0" ]]; then + echo "Adding NVPL libraries for ARM" + DEPS_LIST+=( + "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0" + "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0" + "/usr/local/lib/libnvpl_lapack_core.so.0" + "/usr/local/lib/libnvpl_blas_core.so.0" + ) + DEPS_SONAME+=( + "libnvpl_lapack_lp64_gomp.so.0" + "libnvpl_blas_lp64_gomp.so.0" + "libnvpl_lapack_core.so.0" + "libnvpl_blas_core.so.0" + ) + fi +fi + # run_tests.sh requires DESIRED_CUDA to know what tests to exclude export DESIRED_CUDA="$cuda_version_nodot" @@ -251,9 +326,11 @@ export DESIRED_CUDA="$cuda_version_nodot" rm -rf /usr/local/cuda || true ln -s "/usr/local/cuda-${CUDA_VERSION}" /usr/local/cuda -# Switch `/usr/local/magma` to the desired CUDA version -rm -rf /usr/local/magma || true -ln -s /usr/local/cuda-${CUDA_VERSION}/magma /usr/local/magma +# Switch `/usr/local/magma` to the desired CUDA version (skip for aarch64) +if [[ "$ARCH" != "aarch64" ]]; then + rm -rf /usr/local/magma || true + ln -s /usr/local/cuda-${CUDA_VERSION}/magma /usr/local/magma +fi export CUDA_VERSION=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev) # 10.0.130 export CUDA_VERSION_SHORT=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." 
-f -3 | rev | cut -f1,2 -d".") # 10.0 diff --git a/.ci/pytorch/build.sh b/.ci/pytorch/build.sh index d66aa1120fb30..a8d2520065084 100755 --- a/.ci/pytorch/build.sh +++ b/.ci/pytorch/build.sh @@ -86,10 +86,20 @@ else fi fi +# Enable MKLDNN with ARM Compute Library for ARM builds if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then export USE_MKLDNN=1 + + # ACL is required for aarch64 builds + if [[ ! -d "/acl" ]]; then + echo "ERROR: ARM Compute Library not found at /acl" + echo "ACL is required for aarch64 builds. Check Docker image setup." + exit 1 + fi + export USE_MKLDNN_ACL=1 export ACL_ROOT_DIR=/acl + echo "ARM Compute Library enabled for MKLDNN: ACL_ROOT_DIR=/acl" fi if [[ "$BUILD_ENVIRONMENT" == *riscv64* ]]; then @@ -168,14 +178,16 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then # shellcheck disable=SC1091 source /opt/intel/oneapi/compiler/latest/env/vars.sh # shellcheck disable=SC1091 + source /opt/intel/oneapi/umf/latest/env/vars.sh + # shellcheck disable=SC1091 source /opt/intel/oneapi/ccl/latest/env/vars.sh # shellcheck disable=SC1091 source /opt/intel/oneapi/mpi/latest/env/vars.sh + # shellcheck disable=SC1091 + source /opt/intel/oneapi/pti/latest/env/vars.sh # Enable XCCL build export USE_XCCL=1 export USE_MPI=0 - # XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA - export USE_KINETO=0 export TORCH_XPU_ARCH_LIST=pvc fi diff --git a/.ci/pytorch/common_utils.sh b/.ci/pytorch/common_utils.sh index 9c9d223777466..323ac6cacd889 100644 --- a/.ci/pytorch/common_utils.sh +++ b/.ci/pytorch/common_utils.sh @@ -96,7 +96,6 @@ function pip_build_and_install() { python3 -m pip wheel \ --no-build-isolation \ --no-deps \ - --no-use-pep517 \ -w "${wheel_dir}" \ "${build_target}" fi @@ -308,6 +307,28 @@ function install_torchao() { pip_build_and_install "git+https://github.com/pytorch/ao.git@${commit}" dist/ao } +function install_flash_attn_cute() { + echo "Installing FlashAttention CuTe from GitHub..." + # Grab latest main til we have a pinned commit + local flash_attn_commit + flash_attn_commit=$(git ls-remote https://github.com/Dao-AILab/flash-attention.git HEAD | cut -f1) + + # Clone the repo to a temporary directory + rm -rf flash-attention-build + git clone --depth 1 --recursive https://github.com/Dao-AILab/flash-attention.git flash-attention-build + + pushd flash-attention-build + git checkout "${flash_attn_commit}" + + # Install only the 'cute' sub-directory + pip_install -e flash_attn/cute/ + popd + + # remove the local repo + rm -rf flash-attention-build + echo "FlashAttention CuTe installation complete." +} + function print_sccache_stats() { echo 'PyTorch Build Statistics' sccache --show-stats diff --git a/.ci/pytorch/python_doc_push_script.sh b/.ci/pytorch/python_doc_push_script.sh index ec1187b3fe4c4..6bcd46c4815a6 100755 --- a/.ci/pytorch/python_doc_push_script.sh +++ b/.ci/pytorch/python_doc_push_script.sh @@ -89,23 +89,41 @@ if [ "$is_main_doc" = true ]; then make coverage # Now we have the coverage report, we need to make sure it is empty. - # Count the number of lines in the file and turn that number into a variable - # $lines. The `cut -f1 ...` is to only parse the number, not the filename - # Skip the report header by subtracting 2: the header will be output even if - # there are no undocumented items. + # Sphinx 7.2.6+ format: python.txt contains a statistics table with a TOTAL row + # showing the undocumented count in the third column. 
+ # Example: | TOTAL | 99.83% | 2 | # # Also: see docs/source/conf.py for "coverage_ignore*" items, which should # be documented then removed from there. - lines=$(wc -l build/coverage/python.txt 2>/dev/null |cut -f1 -d' ') - undocumented=$((lines - 2)) - if [ $undocumented -lt 0 ]; then + + # Extract undocumented count from TOTAL row in Sphinx 7.2.6 statistics table + # The table format is: | Module | Coverage | Undocumented | + # Extract the third column (undocumented count) from the TOTAL row + undocumented=$(grep "| TOTAL" build/coverage/python.txt | awk -F'|' '{print $4}' | tr -d ' ') + + if [ -z "$undocumented" ] || ! [[ "$undocumented" =~ ^[0-9]+$ ]]; then echo coverage output not found exit 1 - elif [ $undocumented -gt 0 ]; then - echo undocumented objects found: - cat build/coverage/python.txt + elif [ "$undocumented" -gt 0 ]; then + set +x # Disable command echoing for cleaner output + echo "" + echo "=====================" + echo "UNDOCUMENTED OBJECTS:" + echo "=====================" + echo "" + # Find the line number of the TOTAL row and print only what comes after it + total_line=$(grep -n "| TOTAL" build/coverage/python.txt | cut -d: -f1) + if [ -n "$total_line" ]; then + # Print only the detailed list (skip the statistics table) + tail -n +$((total_line + 2)) build/coverage/python.txt + else + # Fallback to showing entire file if TOTAL line not found + cat build/coverage/python.txt + fi + echo "" echo "Make sure you've updated relevant .rsts in docs/source!" - echo "You can reproduce locally by running 'cd docs && make coverage && cat build/coverage/python.txt'" + echo "You can reproduce locally by running 'cd docs && make coverage && tail -n +\$((grep -n \"| TOTAL\" build/coverage/python.txt | cut -d: -f1) + 2)) build/coverage/python.txt'" + set -x # Re-enable command echoing exit 1 fi else diff --git a/.ci/pytorch/smoke_test/check_binary_symbols.py b/.ci/pytorch/smoke_test/check_binary_symbols.py index b0c607659c72d..51d5174e77912 100755 --- a/.ci/pytorch/smoke_test/check_binary_symbols.py +++ b/.ci/pytorch/smoke_test/check_binary_symbols.py @@ -100,6 +100,337 @@ def check_lib_statically_linked_libstdc_cxx_abi_symbols(lib: str) -> None: ) +def _compile_and_extract_symbols( + cpp_content: str, compile_flags: list[str], exclude_list: list[str] | None = None +) -> list[str]: + """ + Helper to compile a C++ file and extract all symbols. + + Args: + cpp_content: C++ source code to compile + compile_flags: Compilation flags + exclude_list: List of symbol names to exclude. Defaults to ["main"]. + + Returns: + List of all symbols found in the object file (excluding those in exclude_list). + """ + import subprocess + import tempfile + + if exclude_list is None: + exclude_list = ["main"] + + with tempfile.TemporaryDirectory() as tmpdir: + tmppath = Path(tmpdir) + cpp_file = tmppath / "test.cpp" + obj_file = tmppath / "test.o" + + cpp_file.write_text(cpp_content) + + result = subprocess.run( + compile_flags + [str(cpp_file), "-o", str(obj_file)], + capture_output=True, + text=True, + timeout=60, + ) + + if result.returncode != 0: + raise RuntimeError(f"Compilation failed: {result.stderr}") + + symbols = get_symbols(str(obj_file)) + + # Return all symbol names, excluding those in the exclude list + return [name for _addr, _stype, name in symbols if name not in exclude_list] + + +def check_stable_only_symbols(install_root: Path) -> None: + """ + Test TORCH_STABLE_ONLY and TORCH_TARGET_VERSION by compiling test code and comparing symbol counts. + + This approach tests: + 1. 
WITHOUT macros -> many torch symbols exposed + 2. WITH TORCH_STABLE_ONLY -> zero torch symbols (all hidden) + 3. WITH TORCH_TARGET_VERSION -> zero torch symbols (all hidden) + 4. WITH both macros -> zero torch symbols (all hidden) + """ + include_dir = install_root / "include" + assert include_dir.exists(), f"Expected {include_dir} to be present" + + test_cpp_content = """ +// Main torch C++ API headers +#include +#include + +// ATen tensor library +#include + +// Core c10 headers (commonly used) +#include +#include +#include +#include +#include + +int main() { return 0; } +""" + + base_compile_flags = [ + "g++", + "-std=c++17", + f"-I{include_dir}", + f"-I{include_dir}/torch/csrc/api/include", + "-c", # Compile only, don't link + ] + + # Compile WITHOUT any macros + symbols_without = _compile_and_extract_symbols( + cpp_content=test_cpp_content, + compile_flags=base_compile_flags, + ) + + # We expect constexpr symbols, inline functions used by other headers etc. + # to produce symbols + num_symbols_without = len(symbols_without) + print(f"Found {num_symbols_without} symbols without any macros defined") + assert num_symbols_without != 0, ( + "Expected a non-zero number of symbols without any macros" + ) + + # Compile WITH TORCH_STABLE_ONLY (expect 0 symbols) + compile_flags_with_stable_only = base_compile_flags + ["-DTORCH_STABLE_ONLY"] + + symbols_with_stable_only = _compile_and_extract_symbols( + cpp_content=test_cpp_content, + compile_flags=compile_flags_with_stable_only, + ) + + num_symbols_with_stable_only = len(symbols_with_stable_only) + assert num_symbols_with_stable_only == 0, ( + f"Expected no symbols with TORCH_STABLE_ONLY macro, but found {num_symbols_with_stable_only}" + ) + + # Compile WITH TORCH_TARGET_VERSION (expect 0 symbols) + compile_flags_with_target_version = base_compile_flags + [ + "-DTORCH_TARGET_VERSION=1" + ] + + symbols_with_target_version = _compile_and_extract_symbols( + cpp_content=test_cpp_content, + compile_flags=compile_flags_with_target_version, + ) + + num_symbols_with_target_version = len(symbols_with_target_version) + assert num_symbols_with_target_version == 0, ( + f"Expected no symbols with TORCH_TARGET_VERSION macro, but found {num_symbols_with_target_version}" + ) + + # Compile WITH both macros (expect 0 symbols) + compile_flags_with_both = base_compile_flags + [ + "-DTORCH_STABLE_ONLY", + "-DTORCH_TARGET_VERSION=1", + ] + + symbols_with_both = _compile_and_extract_symbols( + cpp_content=test_cpp_content, + compile_flags=compile_flags_with_both, + ) + + num_symbols_with_both = len(symbols_with_both) + assert num_symbols_with_both == 0, ( + f"Expected no symbols with both macros, but found {num_symbols_with_both}" + ) + + +def check_stable_api_symbols(install_root: Path) -> None: + """ + Test that stable API headers still expose symbols with TORCH_STABLE_ONLY. 
+ The torch/csrc/stable/c/shim.h header is tested in check_stable_c_shim_symbols + """ + include_dir = install_root / "include" + assert include_dir.exists(), f"Expected {include_dir} to be present" + + stable_dir = include_dir / "torch" / "csrc" / "stable" + assert stable_dir.exists(), f"Expected {stable_dir} to be present" + + stable_headers = list(stable_dir.rglob("*.h")) + if not stable_headers: + raise RuntimeError("Could not find any stable headers") + + includes = [] + for header in stable_headers: + rel_path = header.relative_to(include_dir) + includes.append(f"#include <{rel_path.as_posix()}>") + + includes_str = "\n".join(includes) + test_stable_content = f""" +{includes_str} +int main() {{ return 0; }} +""" + + compile_flags = [ + "g++", + "-std=c++17", + f"-I{include_dir}", + f"-I{include_dir}/torch/csrc/api/include", + "-c", + "-DTORCH_STABLE_ONLY", + ] + + symbols_stable = _compile_and_extract_symbols( + cpp_content=test_stable_content, + compile_flags=compile_flags, + ) + num_symbols_stable = len(symbols_stable) + print(f"Found {num_symbols_stable} symbols in torch/csrc/stable") + assert num_symbols_stable > 0, ( + f"Expected stable headers to expose symbols with TORCH_STABLE_ONLY, " + f"but found {num_symbols_stable} symbols" + ) + + +def check_headeronly_symbols(install_root: Path) -> None: + """ + Test that header-only utility headers still expose symbols with TORCH_STABLE_ONLY. + """ + include_dir = install_root / "include" + assert include_dir.exists(), f"Expected {include_dir} to be present" + + # Find all headers in torch/headeronly + headeronly_dir = include_dir / "torch" / "headeronly" + assert headeronly_dir.exists(), f"Expected {headeronly_dir} to be present" + headeronly_headers = list(headeronly_dir.rglob("*.h")) + if not headeronly_headers: + raise RuntimeError("Could not find any headeronly headers") + + # Filter out platform-specific headers that may not compile everywhere + platform_specific_keywords = [ + "cpu/vec", + ] + + filtered_headers = [] + for header in headeronly_headers: + rel_path = header.relative_to(include_dir).as_posix() + if not any( + keyword in rel_path.lower() for keyword in platform_specific_keywords + ): + filtered_headers.append(header) + + includes = [] + for header in filtered_headers: + rel_path = header.relative_to(include_dir) + includes.append(f"#include <{rel_path.as_posix()}>") + + includes_str = "\n".join(includes) + test_headeronly_content = f""" +{includes_str} +int main() {{ return 0; }} +""" + + compile_flags = [ + "g++", + "-std=c++17", + f"-I{include_dir}", + f"-I{include_dir}/torch/csrc/api/include", + "-c", + "-DTORCH_STABLE_ONLY", + ] + + symbols_headeronly = _compile_and_extract_symbols( + cpp_content=test_headeronly_content, + compile_flags=compile_flags, + ) + num_symbols_headeronly = len(symbols_headeronly) + print(f"Found {num_symbols_headeronly} symbols in torch/headeronly") + assert num_symbols_headeronly > 0, ( + f"Expected headeronly headers to expose symbols with TORCH_STABLE_ONLY, " + f"but found {num_symbols_headeronly} symbols" + ) + + +def check_aoti_shim_symbols(install_root: Path) -> None: + """ + Test that AOTI shim headers still expose symbols with TORCH_STABLE_ONLY. + """ + include_dir = install_root / "include" + assert include_dir.exists(), f"Expected {include_dir} to be present" + + # There are no constexpr symbols etc., so we need to actually use functions + # so that some symbols are found. 
+ test_shim_content = """ +#include +int main() { + int32_t (*fp1)() = &aoti_torch_device_type_cpu; + int32_t (*fp2)() = &aoti_torch_dtype_float32; + (void)fp1; (void)fp2; + return 0; +} +""" + + compile_flags = [ + "g++", + "-std=c++17", + f"-I{include_dir}", + f"-I{include_dir}/torch/csrc/api/include", + "-c", + "-DTORCH_STABLE_ONLY", + ] + + symbols_shim = _compile_and_extract_symbols( + cpp_content=test_shim_content, + compile_flags=compile_flags, + ) + num_symbols_shim = len(symbols_shim) + assert num_symbols_shim > 0, ( + f"Expected shim headers to expose symbols with TORCH_STABLE_ONLY, " + f"but found {num_symbols_shim} symbols" + ) + + +def check_stable_c_shim_symbols(install_root: Path) -> None: + """ + Test that stable C shim headers still expose symbols with TORCH_STABLE_ONLY. + """ + include_dir = install_root / "include" + assert include_dir.exists(), f"Expected {include_dir} to be present" + + # Check if the stable C shim exists + stable_shim = include_dir / "torch" / "csrc" / "stable" / "c" / "shim.h" + if not stable_shim.exists(): + raise RuntimeError("Could not find stable c shim") + + # There are no constexpr symbols etc., so we need to actually use functions + # so that some symbols are found. + test_stable_shim_content = """ +#include +int main() { + // Reference stable C API functions to create undefined symbols + AOTITorchError (*fp1)(const char*, uint32_t*, int32_t*) = &torch_parse_device_string; + AOTITorchError (*fp2)(uint32_t*) = &torch_get_num_threads; + (void)fp1; (void)fp2; + return 0; +} +""" + + compile_flags = [ + "g++", + "-std=c++17", + f"-I{include_dir}", + f"-I{include_dir}/torch/csrc/api/include", + "-c", + "-DTORCH_STABLE_ONLY", + ] + + symbols_stable_shim = _compile_and_extract_symbols( + cpp_content=test_stable_shim_content, + compile_flags=compile_flags, + ) + num_symbols_stable_shim = len(symbols_stable_shim) + assert num_symbols_stable_shim > 0, ( + f"Expected stable C shim headers to expose symbols with TORCH_STABLE_ONLY, " + f"but found {num_symbols_stable_shim} symbols" + ) + + def check_lib_symbols_for_abi_correctness(lib: str) -> None: print(f"lib: {lib}") cxx11_symbols = grep_symbols(lib, LIBTORCH_CXX11_PATTERNS) @@ -129,6 +460,13 @@ def main() -> None: check_lib_symbols_for_abi_correctness(libtorch_cpu_path) check_lib_statically_linked_libstdc_cxx_abi_symbols(libtorch_cpu_path) + # Check symbols when TORCH_STABLE_ONLY is defined + check_stable_only_symbols(install_root) + check_stable_api_symbols(install_root) + check_headeronly_symbols(install_root) + check_aoti_shim_symbols(install_root) + check_stable_c_shim_symbols(install_root) + if __name__ == "__main__": main() diff --git a/.ci/pytorch/smoke_test/smoke_test.py b/.ci/pytorch/smoke_test/smoke_test.py index 675d58a3e283d..e760340bebb12 100644 --- a/.ci/pytorch/smoke_test/smoke_test.py +++ b/.ci/pytorch/smoke_test/smoke_test.py @@ -353,6 +353,17 @@ def test_linalg(device="cpu") -> None: torch.linalg.svd(A) +def test_sdpa(device="cpu", dtype=torch.float16) -> None: + """Regression test for https://github.com/pytorch/pytorch/issues/167602 + Without nvrtc_builtins on CuDNN-9.13 on CUDA-13 fails with ` No valid execution plans built.` + """ + print(f"Testing SDPA on {device} using type {dtype}") + k, q, v = torch.rand(3, 1, 16, 77, 64, dtype=dtype, device=device).unbind(0) + attn = torch.rand(1, 1, 77, 77, dtype=dtype, device=device) + rc = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn) + assert rc.isnan().any().item() is False + + def smoke_test_compile(device: str = "cpu") 
-> None: supported_dtypes = [torch.float16, torch.float32, torch.float64] @@ -489,10 +500,12 @@ def main() -> None: smoke_test_conv2d() test_linalg() test_numpy() + test_sdpa() if is_cuda_system: test_linalg("cuda") test_cuda_gds_errors_captured() + test_sdpa("cuda") if options.package == "all": smoke_test_modules() diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index 9ae2578758939..687ec4b9e0ae4 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh @@ -208,6 +208,8 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then source /opt/intel/oneapi/ccl/latest/env/vars.sh # shellcheck disable=SC1091 source /opt/intel/oneapi/mpi/latest/env/vars.sh + # shellcheck disable=SC1091 + source /opt/intel/oneapi/pti/latest/env/vars.sh # Check XPU status before testing timeout 30 xpu-smi discovery || true fi @@ -342,8 +344,18 @@ test_python_smoke() { } test_python_smoke_b200() { - # Targeted smoke tests for B200 - staged approach to avoid too many failures - time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running + # Targeted smoke tests for B200 including FlashAttention CuTe coverage + install_flash_attn_cute + time python test/run_test.py \ + --include \ + test_matmul_cuda \ + test_scaled_matmul_cuda \ + inductor/test_fp8 \ + nn/attention/test_fa4 \ + nn/attention/test_open_registry \ + inductor/test_flex_flash \ + $PYTHON_TEST_EXTRA_OPTION \ + --upload-artifacts-while-running assert_git_not_dirty } @@ -824,6 +836,11 @@ test_inductor_halide() { assert_git_not_dirty } +test_inductor_pallas() { + python test/run_test.py --include inductor/test_pallas.py --verbose + assert_git_not_dirty +} + test_inductor_triton_cpu() { python test/run_test.py --include inductor/test_triton_cpu_backend.py inductor/test_torchinductor_strided_blocks.py --verbose assert_git_not_dirty @@ -1663,6 +1680,22 @@ test_operator_microbenchmark() { done } +test_attention_microbenchmark() { + TEST_REPORTS_DIR=$(pwd)/test/test-reports + mkdir -p "$TEST_REPORTS_DIR" + TEST_DIR=$(pwd) + + # Install attention-gym dependency + echo "Installing attention-gym..." + python -m pip install git+https://github.com/meta-pytorch/attention-gym.git@main + pip show triton + + cd "${TEST_DIR}"/benchmarks/transformer + + $TASKSET python score_mod.py --config configs/config_basic.yaml \ + --output-json-for-dashboard "${TEST_REPORTS_DIR}/attention_microbenchmark.json" +} + if ! 
[[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then (cd test && python -c "import torch; print(torch.__config__.show())") (cd test && python -c "import torch; print(torch.__config__.parallel_info())") @@ -1720,10 +1753,14 @@ elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then fi elif [[ "${TEST_CONFIG}" == *operator_microbenchmark* ]]; then test_operator_microbenchmark +elif [[ "${TEST_CONFIG}" == *attention_microbenchmark* ]]; then + test_attention_microbenchmark elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then test_inductor_distributed elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then test_inductor_halide +elif [[ "${TEST_CONFIG}" == *inductor-pallas* ]]; then + test_inductor_pallas elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then test_inductor_triton_cpu elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then diff --git a/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1 b/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1 index a165f2a222caf..f0eabed170d25 100644 --- a/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1 +++ b/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1 @@ -70,7 +70,7 @@ sccache --zero-stats sccache --show-stats # Build the wheel -python -m build --wheel --no-build-isolation +python -m build --wheel --no-isolation if ($LASTEXITCODE -ne 0) { exit 1 } # Install the wheel locally diff --git a/.github/ISSUE_TEMPLATE/release-feature-request.yml b/.github/ISSUE_TEMPLATE/release-feature-request.yml index 80f10807ae56b..090a41d1942f6 100644 --- a/.github/ISSUE_TEMPLATE/release-feature-request.yml +++ b/.github/ISSUE_TEMPLATE/release-feature-request.yml @@ -1,11 +1,11 @@ -name: 🚀 Release highlight for proposed Feature +name: 🚀 New Feature for Release description: Submit a Release highlight for proposed Feature labels: ["release-feature-request"] body: - type: textarea attributes: - label: Release highlight for proposed Feature + label: New Feature for Release description: > Example: “A torch.special module, analogous to SciPy's special module.” - type: input diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index d021371ca8863..dfb30e155b162 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -63,7 +63,7 @@ self-hosted-runner: - linux.rocm.gpu.gfx942.1 - linux.rocm.gpu.gfx942.2 - linux.rocm.gpu.gfx942.4 - - rocm-docker + - linux.rocm.gfx942.docker-cache # Org wise AWS `mac2.metal` runners (2020 Mac mini hardware powered by Apple silicon M1 processors) - macos-m1-stable - macos-m1-14 diff --git a/.github/actions/pytest-cache-download/action.yml b/.github/actions/pytest-cache-download/action.yml index 1406f962c4ca8..3f51f6a5525bc 100644 --- a/.github/actions/pytest-cache-download/action.yml +++ b/.github/actions/pytest-cache-download/action.yml @@ -38,9 +38,9 @@ runs: run: | python3 .github/scripts/pytest_cache.py \ --download \ - --cache_dir $GITHUB_WORKSPACE/$CACHE_DIR \ - --pr_identifier $GITHUB_REF \ - --job_identifier $JOB_IDENTIFIER \ - --temp_dir $RUNNER_TEMP \ - --repo $REPO \ - --bucket $BUCKET \ + --cache_dir "$GITHUB_WORKSPACE/$CACHE_DIR" \ + --pr_identifier "$GITHUB_REF" \ + --job_identifier "$JOB_IDENTIFIER" \ + --temp_dir "$RUNNER_TEMP" \ + --repo "$REPO" \ + --bucket "$BUCKET" \ diff --git a/.github/actions/pytest-cache-upload/action.yml b/.github/actions/pytest-cache-upload/action.yml index 2652d019075f7..9fbb63a760f27 100644 --- a/.github/actions/pytest-cache-upload/action.yml +++ b/.github/actions/pytest-cache-upload/action.yml @@ -47,11 
+47,11 @@ runs: run: | python3 .github/scripts/pytest_cache.py \ --upload \ - --cache_dir $GITHUB_WORKSPACE/$CACHE_DIR \ - --pr_identifier $GITHUB_REF \ - --job_identifier $JOB_IDENTIFIER \ - --sha $SHA \ - --test_config $TEST_CONFIG \ - --shard $SHARD \ - --repo $REPO \ - --temp_dir $RUNNER_TEMP \ + --cache_dir "$GITHUB_WORKSPACE/$CACHE_DIR" \ + --pr_identifier "$GITHUB_REF" \ + --job_identifier "$JOB_IDENTIFIER" \ + --sha "$SHA" \ + --test_config "$TEST_CONFIG" \ + --shard "$SHARD" \ + --repo "$REPO" \ + --temp_dir "$RUNNER_TEMP" \ diff --git a/.github/ci_commit_pins/audio.txt b/.github/ci_commit_pins/audio.txt index 966f6bcfc0d94..8462dd2aa4e55 100644 --- a/.github/ci_commit_pins/audio.txt +++ b/.github/ci_commit_pins/audio.txt @@ -1 +1 @@ -3b0e7a6f192ca2715e7e6cbe5db007aea7165fe2 +07b6cbde121417a70e4dc871adb6d27030e0ce3f diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt index 183e9fb4b06e1..7e4dcdb5b18ab 100644 --- a/.github/ci_commit_pins/vision.txt +++ b/.github/ci_commit_pins/vision.txt @@ -1 +1 @@ -cfbc5c2f1c798991715a6b06bb3ce46478c4487c +acccf86477759b2d3500f1ae1be065f7b1e409ec diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt index 01f0673fcf802..191c21631f662 100644 --- a/.github/ci_commit_pins/xla.txt +++ b/.github/ci_commit_pins/xla.txt @@ -1 +1 @@ -c8b09f5f77d6bf6fb7ed7a9aa83e5d8156b3a5e9 +e4d25697f9dc5eedaf8f0a5bf085c62c5455a53a diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 0000000000000..06c3f32abd5e1 --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,125 @@ +# PyTorch Copilot Instructions + +This is the PyTorch machine learning framework codebase. These instructions help AI agents navigate and contribute effectively. + +## Architecture Overview + +### Core Components + +- **c10/** - Core library (C++-10 compatible) for essential, binary-size-conscious functionality +- **aten/** - ATen tensor library (C++), PyTorch's foundation without autograd + - `aten/src/ATen/native/` - Modern operator implementations (CPU/CUDA/MPS/sparse) + - `aten/src/ATen/native/native_functions.yaml` - **Critical**: Declarative operator registry +- **torch/** - Python bindings and public API + - `torch/csrc/` - C++ Python bindings (hand-written and generated) + - `torch/csrc/autograd/` - Reverse-mode automatic differentiation + - `torch/csrc/jit/` - TorchScript JIT compiler +- **torchgen/** - Code generation tooling that reads `native_functions.yaml` +- **tools/** - Build scripts, autograd derivatives, code generation + +### The Code Generation Workflow + +**Most operator changes require editing `native_functions.yaml`**, not direct C++ files. This YAML file: +1. Declares operator signatures, variants (function/method), and dispatch behavior +2. Gets processed by `torchgen/` to generate C++/Python bindings +3. Produces headers in `build/aten/src/ATen/` during compilation + +Example entry structure: +```yaml +- func: my_op(Tensor self, Scalar alpha=1) -> Tensor + variants: function, method + dispatch: + CPU: my_op_cpu + CUDA: my_op_cuda +``` + +After editing `native_functions.yaml`, implement kernels in `aten/src/ATen/native/` (see `aten/src/ATen/native/README.md`). + +## Development Workflows + +### Building from Source + +**Never run `setup.py` directly** - use pip with editable install: +```bash +python -m pip install --no-build-isolation -v -e . 
+``` + +Speed up builds: +- `DEBUG=1` - Debug symbols with `-g -O0` +- `USE_CUDA=0` - Skip CUDA compilation +- `BUILD_TEST=0` - Skip C++ test binaries +- Install `ninja` (`pip install ninja`) for faster builds +- Use `ccache` for incremental compilation caching + +Rebuild specific targets: `(cd build && ninja )` + +### Testing + +**Critical**: DO NOT run entire test suites. Run specific tests only: +```bash +python test/test_torch.py TestTorch.test_specific_case +``` + +**Test structure**: All tests use `torch.testing._internal.common_utils`: +```python +from torch.testing._internal.common_utils import run_tests, TestCase + +class TestFeature(TestCase): + def test_something(self): + # Use self.assertEqual for tensor comparisons + pass + +if __name__ == "__main__": + run_tests() +``` + +**For bug fixes**: Create a standalone reproduction script first, verify it fails, then fix and add to appropriate test file. + +### Linting + +Run linter (not pre-commit): `lintrunner -a` (auto-applies fixes) + +## Project-Specific Conventions + +### Memory and Storage +- **Storage is never nullptr** (but `StorageImpl.data` may be nullptr for unallocated outputs) +- CUDA device info lives in storage objects + +### Python-C++ Integration (`torch/csrc/`) +- Always include `Python.h` **first** to avoid `_XOPEN_SOURCE` redefinition errors +- Use `pybind11::gil_scoped_acquire` before calling Python API or using `THPObjectPtr` +- Wrap entry points with `HANDLE_TH_ERRORS` / `END_HANDLE_TH_ERRORS` for exception conversion + +### Dispatch System +- PyTorch uses operator dispatch to route calls to backend-specific kernels +- Prefer `CompositeExplicitAutograd` dispatch when writing device-agnostic compound ops +- See `aten/src/ATen/native/README.md` for dispatch keyword guidance + +## Git Workflow (AI Agent Specific) + +When preparing PRs from this environment: +```bash +git stash -u +git reset --hard $(cat /tmp/orig_work.txt) # Reset to LOCAL branch +git stash pop +# Resolve conflicts if necessary +``` + +## Common Gotchas + +1. **Editing generated files** - If it's in `build/`, don't edit it. Edit the source template or `native_functions.yaml` +2. **NVCC template compilation** - NVCC is stricter about C++ than gcc/clang; code working on Linux may fail Windows CI +3. **Windows symbol visibility** - Use `TORCH_API` macros for exported symbols (required on Windows, optional on Linux) +4. **No internet access** - DO NOT attempt to install dependencies during development + +## Key Files Reference + +- `AGENTS.md` - Instructions specific to AI coding agents +- `CONTRIBUTING.md` - Comprehensive human contributor guide +- `GLOSSARY.md` - Terminology (ATen, kernels, operations, JIT, TorchScript) +- `aten/src/ATen/native/README.md` - Operator implementation guide +- `tools/autograd/derivatives.yaml` - Gradient definitions for autograd + +## Performance Debugging + +Use `TORCH_SHOW_CPP_STACKTRACES=1` for C++ traces in Python errors. For profiling, prefer `py-spy` over manual instrumentation. 
diff --git a/.github/labeler.yml b/.github/labeler.yml index 7b47b9fefb5dc..e8d3c223af317 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -138,7 +138,8 @@ - test/test_matmul_cuda.py - test/test_scaled_matmul_cuda.py - test/inductor/test_fp8.py -- aten/src/ATen/native/cuda/Blas.cpp +- aten/src/ATen/native/cuda/*Blas.cpp +- aten/src/ATen/cuda/CUDA*Blas.* - torch/**/*cublas* - torch/_inductor/kernel/mm.py - test/inductor/test_max_autotune.py @@ -148,7 +149,8 @@ - test/test_matmul_cuda.py - test/test_scaled_matmul_cuda.py - test/inductor/test_fp8.py -- aten/src/ATen/native/cuda/Blas.cpp +- aten/src/ATen/native/cuda/*Blas.cpp +- aten/src/ATen/cuda/CUDA*Blas.* - torch/**/*cublas* - torch/_inductor/kernel/mm.py - test/inductor/test_max_autotune.py @@ -158,7 +160,21 @@ - test/test_matmul_cuda.py - test/test_scaled_matmul_cuda.py - test/inductor/test_fp8.py -- aten/src/ATen/native/cuda/Blas.cpp +- aten/src/ATen/native/cuda/*Blas.cpp +- aten/src/ATen/cuda/CUDA*Blas.* - torch/_inductor/kernel/mm.py - test/inductor/test_max_autotune.py - third_party/fbgemm + +"ciflow/mps": +- aten/src/ATen/mps/** +- aten/src/ATen/native/mps/** +- torch/_inductor/codegen/mps.py +- test/test_mps.py +- test/inductor/test_mps_basic.py + +"ciflow/h100-symm-mem": +- torch/csrc/distributed/c10d/symm_mem/** +- torch/distributed/_symmetric_memory/** +- test/distributed/**/*mem* +- test/distributed/**/*mem*/** diff --git a/.github/nitpicks.yml b/.github/nitpicks.yml index 1d08a36abf1d5..e3fe5d4725587 100644 --- a/.github/nitpicks.yml +++ b/.github/nitpicks.yml @@ -10,3 +10,4 @@ pathFilter: - 'torch/csrc/inductor/aoti_torch/c/*' - 'torch/csrc/inductor/aoti_torch/generated/*' + - 'torch/csrc/stable/c/*' diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index c15ba606398f6..8de0df02a132c 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -2,8 +2,8 @@ tracking_issue: 24422 ciflow_tracking_issue: 64124 ciflow_push_tags: - ciflow/b200 -- ciflow/b200-symm-mem - ciflow/b200-distributed +- ciflow/b200-symm-mem - ciflow/binaries - ciflow/binaries_libtorch - ciflow/binaries_wheel @@ -22,6 +22,8 @@ ciflow_push_tags: - ciflow/inductor-perf-test-nightly-xpu - ciflow/inductor-periodic - ciflow/inductor-rocm +- ciflow/inductor-rocm-mi200 +- ciflow/inductor-rocm-mi300 - ciflow/linux-aarch64 - ciflow/mps - ciflow/nightly @@ -33,11 +35,13 @@ ciflow_push_tags: - ciflow/quantization-periodic - ciflow/riscv64 - ciflow/rocm +- ciflow/rocm-mi200 - ciflow/rocm-mi300 - ciflow/rocm-mi355 - ciflow/rocm-navi31 - ciflow/s390 - ciflow/slow +- ciflow/slow-rocm-mi200 - ciflow/torchbench - ciflow/triton_binaries - ciflow/trunk diff --git a/.github/scripts/delete_old_branches.py b/.github/scripts/delete_old_branches.py index 8032008edf122..42cd851f8e338 100644 --- a/.github/scripts/delete_old_branches.py +++ b/.github/scripts/delete_old_branches.py @@ -1,10 +1,11 @@ # Delete old branches import os import re +from collections.abc import Callable from datetime import datetime from functools import lru_cache from pathlib import Path -from typing import Any, Callable +from typing import Any from github_utils import gh_fetch_json_dict, gh_graphql from gitutils import GitRepo diff --git a/.github/scripts/filter_test_configs.py b/.github/scripts/filter_test_configs.py index 592c7aab6d933..ee102d3f560f9 100755 --- a/.github/scripts/filter_test_configs.py +++ b/.github/scripts/filter_test_configs.py @@ -8,10 +8,11 @@ import subprocess import sys import warnings +from collections.abc import Callable from enum 
import Enum from functools import cache from logging import info -from typing import Any, Callable, Optional +from typing import Any, Optional from urllib.request import Request, urlopen import yaml diff --git a/.github/scripts/generate_pytorch_version.py b/.github/scripts/generate_pytorch_version.py index b35ccf6bcd38a..85be79c762e28 100755 --- a/.github/scripts/generate_pytorch_version.py +++ b/.github/scripts/generate_pytorch_version.py @@ -50,7 +50,7 @@ def get_tag() -> str: def get_base_version() -> str: root = get_pytorch_root() - dirty_version = open(root / "version.txt").read().strip() + dirty_version = Path(root / "version.txt").read_text().strip() # Strips trailing a0 from version.txt, not too sure why it's there in the # first place return re.sub(LEGACY_BASE_VERSION_SUFFIX_PATTERN, "", dirty_version) diff --git a/.github/scripts/get_workflow_job_id.py b/.github/scripts/get_workflow_job_id.py index b04cbed76e955..54e66621c9fd0 100644 --- a/.github/scripts/get_workflow_job_id.py +++ b/.github/scripts/get_workflow_job_id.py @@ -11,7 +11,8 @@ import time import urllib import urllib.parse -from typing import Any, Callable, Optional +from collections.abc import Callable +from typing import Any, Optional from urllib.request import Request, urlopen diff --git a/.github/scripts/github_utils.py b/.github/scripts/github_utils.py index 110015988a5c3..6479fb64ddbaf 100644 --- a/.github/scripts/github_utils.py +++ b/.github/scripts/github_utils.py @@ -3,8 +3,9 @@ import json import os import warnings +from collections.abc import Callable from dataclasses import dataclass -from typing import Any, Callable, cast, Optional, Union +from typing import Any, cast, Optional, Union from urllib.error import HTTPError from urllib.parse import quote from urllib.request import Request, urlopen diff --git a/.github/scripts/gitutils.py b/.github/scripts/gitutils.py index 3a90ddb5f4c6b..6e3bb3f209177 100644 --- a/.github/scripts/gitutils.py +++ b/.github/scripts/gitutils.py @@ -4,10 +4,10 @@ import re import tempfile from collections import defaultdict -from collections.abc import Iterator +from collections.abc import Callable, Iterator from datetime import datetime from functools import wraps -from typing import Any, Callable, cast, Optional, TypeVar, Union +from typing import Any, cast, Optional, TypeVar, Union T = TypeVar("T") diff --git a/.github/scripts/lintrunner.sh b/.github/scripts/lintrunner.sh index b353617a45b2b..58cda19cfeb43 100755 --- a/.github/scripts/lintrunner.sh +++ b/.github/scripts/lintrunner.sh @@ -34,6 +34,9 @@ python3 torch/utils/data/datapipes/gen_pyi.py # Also check generated pyi files find torch -name '*.pyi' -exec git add --force -- "{}" + +# Print current environment +python3 -m pip freeze + RC=0 # Run lintrunner on all files if ! 
lintrunner --force-color --tee-json=lint.json ${ADDITIONAL_LINTRUNNER_ARGS} 2> /dev/null; then diff --git a/.github/scripts/trymerge.py b/.github/scripts/trymerge.py index c258284a00d83..697ab6992793d 100755 --- a/.github/scripts/trymerge.py +++ b/.github/scripts/trymerge.py @@ -17,12 +17,12 @@ import time import urllib.parse from collections import defaultdict -from collections.abc import Iterable +from collections.abc import Callable, Iterable from dataclasses import dataclass from functools import cache from pathlib import Path from re import Pattern -from typing import Any, Callable, cast, NamedTuple, Optional +from typing import Any, cast, NamedTuple, Optional from warnings import warn import yaml diff --git a/.github/workflows/_binary-build-linux.yml b/.github/workflows/_binary-build-linux.yml index bfa035bc753b8..cb4cc738abaef 100644 --- a/.github/workflows/_binary-build-linux.yml +++ b/.github/workflows/_binary-build-linux.yml @@ -260,11 +260,8 @@ jobs: "${DOCKER_IMAGE}" ) docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - if [[ ${BUILD_ENVIRONMENT} == *"aarch64"* ]]; then - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/aarch64_linux/aarch64_ci_build.sh" - else - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh" - fi + # Unified build script for all architectures (x86_64, aarch64, s390x) + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh" - name: Chown artifacts if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml index 43ed76a63cc67..608aeba53e6d8 100644 --- a/.github/workflows/_rocm-test.yml +++ b/.github/workflows/_rocm-test.yml @@ -97,8 +97,8 @@ jobs: shell: bash run: | ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') - if [[ $ngpu -lt 4 ]]; then - echo "Error: only $ngpu GPU(s) detected, at least 4 GPUs are needed for distributed jobs" + if [[ $ngpu -lt 2 ]]; then #We are temporarily reducing this down to 2 from 4 so that we can run tests on nodes with less gpus. 
+ echo "Error: only $ngpu GPU(s) detected, at least 2 GPUs are needed for distributed jobs" exit 1 fi diff --git a/.github/workflows/_xpu-test.yml b/.github/workflows/_xpu-test.yml index e68bc6ead3a26..d27325b8a63dc 100644 --- a/.github/workflows/_xpu-test.yml +++ b/.github/workflows/_xpu-test.yml @@ -344,5 +344,21 @@ jobs: if-no-files-found: ignore path: ./**/core.[1-9]* + - name: Authenticate with AWS + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results + # The max duration enforced by the server side + role-duration-seconds: 18000 + aws-region: us-east-1 + + - name: Upload the benchmark results + uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main + with: + benchmark-results-dir: test/test-reports + dry-run: false + schema-version: v3 + github-token: ${{ secrets.GITHUB_TOKEN }} + - name: Teardown XPU uses: ./.github/actions/teardown-xpu diff --git a/.github/workflows/attention_op_microbenchmark.yml b/.github/workflows/attention_op_microbenchmark.yml new file mode 100644 index 0000000000000..e01bc49621dcf --- /dev/null +++ b/.github/workflows/attention_op_microbenchmark.yml @@ -0,0 +1,73 @@ +name: attention_op_microbenchmark + +on: + push: + tags: + - ciflow/op-benchmark/* + workflow_dispatch: + schedule: + # Run at 06:00 UTC everyday + - cron: 0 7 * * * + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + attn-microbenchmark-build: + if: github.repository_owner == 'pytorch' + uses: ./.github/workflows/_linux-build.yml + with: + runner: linux.12xlarge.memory + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '8.0 9.0' + test-matrix: | + { include: [ + { config: "attention_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.a100" }, + { config: "attention_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.h100" }, + ]} + secrets: inherit + + attn-microbenchmark-test: + name: attn-microbenchmark-test + uses: ./.github/workflows/_linux-test.yml + needs: attn-microbenchmark-build + with: + timeout-minutes: 500 + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image: ${{ needs.attn-microbenchmark-build.outputs.docker-image }} + test-matrix: ${{ needs.attn-microbenchmark-build.outputs.test-matrix }} + secrets: inherit + + # B200 runner + opmicrobenchmark-build-b200: + if: github.repository_owner == 'pytorch' + name: opmicrobenchmark-build-b200 + uses: ./.github/workflows/_linux-build.yml + with: + runner: linux.12xlarge.memory + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '10.0' + test-matrix: | + { include: [ + { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.dgx.b200" }, + ]} + secrets: inherit + + opmicrobenchmark-test-b200: + name: opmicrobenchmark-test-b200 + uses: ./.github/workflows/_linux-test.yml + needs: opmicrobenchmark-build-b200 + with: + timeout-minutes: 500 + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100 + docker-image: ${{ 
needs.opmicrobenchmark-build-b200.outputs.docker-image }} + test-matrix: ${{ needs.opmicrobenchmark-build-b200.outputs.test-matrix }} + aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + secrets: inherit diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 6fbe2e846d40b..408a8f0000504 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -56,6 +56,8 @@ jobs: pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9, pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11, pytorch-linux-jammy-py3.10-clang12, + pytorch-linux-jammy-py3.11-clang12, + pytorch-linux-jammy-py3.12-clang12, pytorch-linux-jammy-py3.13-clang12, pytorch-linux-jammy-py3.14-clang12, pytorch-linux-jammy-rocm-n-py3, @@ -65,9 +67,10 @@ jobs: pytorch-linux-jammy-py3.10-gcc11, pytorch-linux-jammy-py3-gcc11-inductor-benchmarks, pytorch-linux-jammy-py3.12-halide, + pytorch-linux-jammy-cuda12.8-py3.12-pallas, pytorch-linux-jammy-xpu-n-1-py3, - pytorch-linux-jammy-xpu-n-py3, - pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks, + pytorch-linux-noble-xpu-n-py3, + pytorch-linux-noble-xpu-n-py3-inductor-benchmarks, pytorch-linux-jammy-py3-clang18-asan, pytorch-linux-jammy-py3-clang12-onnx, pytorch-linux-jammy-linter, @@ -77,9 +80,11 @@ jobs: pytorch-linux-noble-riscv64-py3.12-gcc14 ] include: - - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11 + - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc13 runner: linux.arm64.m7g.4xlarge - - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks + - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-clang21 + runner: linux.arm64.m7g.4xlarge + - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc13-inductor-benchmarks runner: linux.arm64.m7g.4xlarge timeout-minutes: 600 # Docker uploads fail from LF runners, see https://github.com/pytorch/pytorch/pull/137358 @@ -114,6 +119,22 @@ jobs: with: docker-image: ${{ steps.build-docker-image.outputs.docker-image }} + - name: Generate output + if: contains(matrix.docker-image-name, 'rocm') + id: generate_output + run: | + docker_image_name="${{ matrix.docker-image-name }}" + docker_image_tag="${{ steps.build-docker-image.outputs.docker-image }}" + echo "${docker_image_name}=${docker_image_tag}" >> docker-builds-output-${docker_image_name}.txt + + - name: Upload artifacts + uses: actions/upload-artifact@v4.4.0 + if: contains(matrix.docker-image-name, 'rocm') + with: + name: docker-builds-artifacts-${{ matrix.docker-image-name }} + retention-days: 14 + path: ./docker-builds-output-${{ matrix.docker-image-name }}.txt + - uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0 name: Push to https://ghcr.io/ id: push-to-ghcr-io diff --git a/.github/workflows/docker-cache-mi300.yml b/.github/workflows/docker-cache-mi300.yml deleted file mode 100644 index 02c1171c567aa..0000000000000 --- a/.github/workflows/docker-cache-mi300.yml +++ /dev/null @@ -1,55 +0,0 @@ -name: docker-cache-mi300 - -on: - # run every 6 hours - schedule: - - cron: 0 0,6,12,18 * * * - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }} - cancel-in-progress: true - -permissions: - id-token: write - contents: read - -jobs: - docker-cache: - if: github.repository_owner == 'pytorch' - runs-on: rocm-docker - steps: - - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main - with: - no-sudo: true - - - name: 
configure aws credentials - id: aws_creds - uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 - with: - role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only - aws-region: us-east-1 - role-duration-seconds: 18000 - - - name: Login to Amazon ECR - id: login-ecr - continue-on-error: false - uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 - - - name: Calculate docker image - id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main - with: - docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 - push: false - - - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main - with: - docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - - - name: Tar and upload to S3 bucket - run: | - sudo docker save -o ~/docker-data/pytorch/pytorch_docker_image.tar ${{ steps.calculate-docker-image.outputs.docker-image }} - sudo rclone copy -P --s3-upload-concurrency 64 --s3-chunk-size 200M --s3-upload-cutoff 300M ~/docker-data/pytorch/pytorch_docker_image.tar oci:pytorchbucket0002/pytorch_docker_image --progress diff --git a/.github/workflows/docker-cache-rocm.yml b/.github/workflows/docker-cache-rocm.yml new file mode 100644 index 0000000000000..78d38de3ac69a --- /dev/null +++ b/.github/workflows/docker-cache-rocm.yml @@ -0,0 +1,105 @@ +name: docker-cache-rocm + +on: + workflow_run: + workflows: [docker-builds] + branches: [main, release] + types: + - completed + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + actions: read + +jobs: + download-docker-builds-artifacts: + if: github.repository_owner == 'pytorch' + name: download-docker-builds-artifacts + runs-on: ubuntu-latest + outputs: + pytorch-linux-jammy-rocm-n-py3: ${{ steps.process-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3 }} + pytorch-linux-noble-rocm-n-py3: ${{ steps.process-artifacts.outputs.pytorch-linux-noble-rocm-n-py3 }} + pytorch-linux-jammy-rocm-n-py3-benchmarks: ${{ steps.process-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3-benchmarks }} + steps: + - name: Download artifacts + uses: actions/download-artifact@v4.1.7 + with: + run-id: ${{ github.event.workflow_run.id }} + path: ./docker-builds-artifacts + merge-multiple: true + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Process artifacts + id: process-artifacts + run: | + ls -R ./docker-builds-artifacts + cat ./docker-builds-artifacts/*txt >> "${GITHUB_OUTPUT}" + cat "${GITHUB_OUTPUT}" + + docker-cache: + if: github.repository_owner == 'pytorch' + needs: download-docker-builds-artifacts + strategy: + fail-fast: false + matrix: + runner: [linux.rocm.gfx942.docker-cache] + docker-image: [ + "${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3 }}", + "${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-noble-rocm-n-py3 }}", + "${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3-benchmarks }}" + ] + runs-on: "${{ matrix.runner }}" + steps: + - name: debug + run: | + JSON_STRINGIFIED="${{ toJSON(needs.download-docker-builds-artifacts.outputs) }}" + echo "Outputs of download-docker-builds-artifacts job: ${JSON_STRINGIFIED}" + + - name: configure aws credentials + id: aws_creds + uses: 
aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + + - name: Login to Amazon ECR + id: login-ecr + continue-on-error: false + uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 + + - name: Generate ghrc.io tag + id: ghcr-io-tag + run: | + ecr_image="${{ matrix.docker-image }}" + ghcr_image="ghcr.io/pytorch/ci-image:${ecr_image##*:}" + echo "ghcr_image=${ghcr_image}" >> "$GITHUB_OUTPUT" + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@main + with: + docker-image: ${{ steps.ghcr-io-tag.outputs.ghcr_image }} + + - name: Save as tarball + run: | + docker_image_tag=${{ matrix.docker-image }} + docker_image_tag="${docker_image_tag#*:}" # Remove everything before and including first ":" + docker_image_tag="${docker_image_tag%-*}" # Remove everything after and including last "-" + ref_name=${{ github.event.workflow_run.head_branch }} + if [[ $ref_name =~ "release/" ]]; then + ref_suffix="release" + elif [[ $ref_name == "main" ]]; then + ref_suffix="main" + else + echo "Unexpected branch in ref_name: ${ref_name}" && exit 1 + fi + docker tag ${{ steps.ghcr-io-tag.outputs.ghcr_image }} ${{ matrix.docker-image }} + # mv is atomic operation, so we use intermediate tar.tmp file to prevent read-write contention + docker save -o ~/pytorch-data/docker/${docker_image_tag}.tar.tmp ${{ matrix.docker-image }} + mv ~/pytorch-data/docker/${docker_image_tag}.tar.tmp ~/pytorch-data/docker/${docker_image_tag}_${ref_suffix}.tar diff --git a/.github/workflows/h100-distributed.yml b/.github/workflows/h100-distributed.yml index be19b8f961f4d..c05b61e30a635 100644 --- a/.github/workflows/h100-distributed.yml +++ b/.github/workflows/h100-distributed.yml @@ -37,7 +37,6 @@ jobs: needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runner: "linux.c7i.12xlarge" build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 cuda-arch-list: '9.0' diff --git a/.github/workflows/inductor-perf-test-nightly-aarch64.yml b/.github/workflows/inductor-perf-test-nightly-aarch64.yml index e16c8be79130d..46a1966570c63 100644 --- a/.github/workflows/inductor-perf-test-nightly-aarch64.yml +++ b/.github/workflows/inductor-perf-test-nightly-aarch64.yml @@ -72,7 +72,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner: linux.arm64.m7g.4xlarge build-environment: linux-jammy-aarch64-py3.10 - docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks + docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc13-inductor-benchmarks test-matrix: | { include: [ { config: "inductor_huggingface_perf_cpu_aarch64", shard: 1, num_shards: 9, runner: "linux.arm64.m7g.metal" }, diff --git a/.github/workflows/inductor-perf-test-nightly-xpu.yml b/.github/workflows/inductor-perf-test-nightly-xpu.yml index c2db8c310e368..28b10996bf38a 100644 --- a/.github/workflows/inductor-perf-test-nightly-xpu.yml +++ b/.github/workflows/inductor-perf-test-nightly-xpu.yml @@ -83,8 +83,8 @@ jobs: needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-xpu-n-py3.10 - docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks + build-environment: 
linux-noble-xpu-n-py3.10 + docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3-inductor-benchmarks runner: linux.c7i.12xlarge test-matrix: | { include: [ @@ -117,7 +117,7 @@ jobs: uses: ./.github/workflows/_xpu-test.yml needs: xpu-n-py3_10-inductor-benchmark-build with: - build-environment: linux-jammy-xpu-n-py3.10 + build-environment: linux-noble-xpu-n-py3.10 dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-false-cppwrapper-true-aotinductor-true-freezing_cudagraphs-false-cudagraphs_low_precision-false docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }} test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }} @@ -137,7 +137,7 @@ jobs: uses: ./.github/workflows/_xpu-test.yml needs: xpu-n-py3_10-inductor-benchmark-build with: - build-environment: linux-jammy-xpu-n-py3.10 + build-environment: linux-noble-xpu-n-py3.10 dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }} docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }} test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }} diff --git a/.github/workflows/inductor-rocm.yml b/.github/workflows/inductor-rocm-mi200.yml similarity index 95% rename from .github/workflows/inductor-rocm.yml rename to .github/workflows/inductor-rocm-mi200.yml index b2ff53a645481..55de9a2121cf6 100644 --- a/.github/workflows/inductor-rocm.yml +++ b/.github/workflows/inductor-rocm-mi200.yml @@ -1,13 +1,13 @@ -name: inductor-rocm +name: inductor-rocm-mi200 on: schedule: - - cron: 0 * * * * + - cron: 0 */3 * * * push: branches: - release/* tags: - - ciflow/inductor-rocm/* + - ciflow/inductor-rocm-mi200/* workflow_dispatch: concurrency: diff --git a/.github/workflows/inductor-rocm-mi300.yml b/.github/workflows/inductor-rocm-mi300.yml index 732ec7eb85f3e..dee10a0db3c16 100644 --- a/.github/workflows/inductor-rocm-mi300.yml +++ b/.github/workflows/inductor-rocm-mi300.yml @@ -7,6 +7,7 @@ on: - release/* tags: - ciflow/inductor-rocm/* + - ciflow/inductor-rocm-mi300/* workflow_dispatch: concurrency: diff --git a/.github/workflows/inductor-unittest.yml b/.github/workflows/inductor-unittest.yml index 6ab276a57fc4d..ca9b57cab2ddb 100644 --- a/.github/workflows/inductor-unittest.yml +++ b/.github/workflows/inductor-unittest.yml @@ -81,6 +81,32 @@ jobs: test-matrix: ${{ needs.inductor-halide-build.outputs.test-matrix }} secrets: inherit + inductor-pallas-build: + name: inductor-pallas-build + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + build-environment: linux-jammy-cuda12.8-py3.12-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-py3.12-pallas + cuda-arch-list: '8.9' + runner: linux.8xlarge.memory + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + test-matrix: | + { include: [ + { config: "inductor-pallas", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu" }, + ]} + secrets: inherit + + inductor-pallas-test: + name: inductor-pallas-test + uses: ./.github/workflows/_linux-test.yml + needs: inductor-pallas-build + with: + build-environment: linux-jammy-py3.12-gcc11 + 
docker-image: ${{ needs.inductor-pallas-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-pallas-build.outputs.test-matrix }} + secrets: inherit + inductor-triton-cpu-build: name: inductor-triton-cpu-build uses: ./.github/workflows/_linux-build.yml @@ -115,10 +141,10 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" test-matrix: | { include: [ - { config: "inductor_amx", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, - { config: "inductor_amx", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, - { config: "inductor_avx2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.10xlarge.avx2" }, - { config: "inductor_avx2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.10xlarge.avx2" }, + { config: "inductor_amx", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, + { config: "inductor_amx", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, + { config: "inductor_avx2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.avx2" }, + { config: "inductor_avx2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.avx2" }, ]} secrets: inherit diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml index 2616141c0dc2a..8a913c3b36a11 100644 --- a/.github/workflows/inductor.yml +++ b/.github/workflows/inductor.yml @@ -84,13 +84,13 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" test-matrix: | { include: [ - { config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, - { config: "cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, - { config: "dynamic_cpu_inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, - { config: "dynamic_cpu_inductor_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, - { config: "dynamic_cpu_inductor_timm", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, - { config: "dynamic_cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, - { config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, + { config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, + { config: "cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, + { config: "dynamic_cpu_inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, + { config: "dynamic_cpu_inductor_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, + { config: "dynamic_cpu_inductor_timm", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, + { config: "dynamic_cpu_inductor_torchbench", shard: 1, 
num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, + { config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, { config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.24xl.spr-metal" }, ]} build-additional-packages: "vision audio torchao" diff --git a/.github/workflows/linux-aarch64.yml b/.github/workflows/linux-aarch64.yml index 2b840a39a5c21..e6690b1043006 100644 --- a/.github/workflows/linux-aarch64.yml +++ b/.github/workflows/linux-aarch64.yml @@ -33,7 +33,7 @@ jobs: with: runner_prefix: ${{ needs.get-label-type.outputs.label-type }} build-environment: linux-jammy-aarch64-py3.10 - docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc13 runner: linux.arm64.m7g.4xlarge test-matrix: | { include: [ diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 0682dd2144afd..c47b0c5763078 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -5,9 +5,11 @@ on: - cron: 0 0 * * * push: tags: - # NOTE: Doc build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + # NOTE: Doc build pipelines should only get triggered on: + # Major or minor release candidates builds + - v[0-9]+.[0-9]+.0+-rc[0-9]+ + # Final RC for major, minor and patch releases + - v[0-9]+.[0-9]+.[0-9]+ - ciflow/nightly/* workflow_dispatch: diff --git a/.github/workflows/operator_benchmark.yml b/.github/workflows/operator_benchmark.yml index 40fb3b8d0c85f..758147f5fe18e 100644 --- a/.github/workflows/operator_benchmark.yml +++ b/.github/workflows/operator_benchmark.yml @@ -60,7 +60,7 @@ jobs: with: build-environment: linux-jammy-aarch64-py3.10 runner: linux.arm64.m7g.4xlarge - docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc13 test-matrix: | { include: [ { config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.arm64.m8g.4xlarge" }, diff --git a/.github/workflows/periodic-rocm-mi200.yml b/.github/workflows/periodic-rocm-mi200.yml index 6b65bf05cbde0..18e7b60570bf8 100644 --- a/.github/workflows/periodic-rocm-mi200.yml +++ b/.github/workflows/periodic-rocm-mi200.yml @@ -11,7 +11,6 @@ on: - cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests push: tags: - - ciflow/periodic/* - ciflow/periodic-rocm-mi200/* branches: - release/* diff --git a/.github/workflows/periodic-rocm-mi300.yml b/.github/workflows/periodic-rocm-mi300.yml index 4d8890e69fc73..ce68ee8bc8e03 100644 --- a/.github/workflows/periodic-rocm-mi300.yml +++ b/.github/workflows/periodic-rocm-mi300.yml @@ -11,6 +11,7 @@ on: - cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests push: tags: + - ciflow/periodic/* - ciflow/periodic-rocm-mi300/* branches: - release/* diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index e3af55e736503..e5fd10c70db61 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -342,16 +342,16 @@ jobs: test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} secrets: inherit - linux-jammy-xpu-n-py3_10-build: - name: linux-jammy-xpu-n-py3.10 + linux-noble-xpu-n-py3_10-build: + 
name: linux-noble-xpu-n-py3.10 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: # This should sync with the build in xpu.yml but xpu uses a larger runner # sync-tag: linux-xpu-n-build runner_prefix: ${{ needs.get-label-type.outputs.label-type }} - build-environment: linux-jammy-xpu-n-py3.10 - docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3 + build-environment: linux-noble-xpu-n-py3.10 + docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3 test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 4, runner: "linux.idc.xpu" }, diff --git a/.github/workflows/rocm.yml b/.github/workflows/rocm-mi200.yml similarity index 97% rename from .github/workflows/rocm.yml rename to .github/workflows/rocm-mi200.yml index ffe6efbe0433c..c947e361bfcb5 100644 --- a/.github/workflows/rocm.yml +++ b/.github/workflows/rocm-mi200.yml @@ -1,15 +1,16 @@ -name: rocm +name: rocm-mi200 on: push: branches: - release/* tags: - - ciflow/rocm/* + - ciflow/rocm-mi200/* workflow_dispatch: schedule: - cron: 29 8 * * * # about 1:29am PDT - - cron: 0 * * * * + - cron: 0 */3 * * * + concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} diff --git a/.github/workflows/rocm-mi300.yml b/.github/workflows/rocm-mi300.yml index c50111d068d24..d20b37be20876 100644 --- a/.github/workflows/rocm-mi300.yml +++ b/.github/workflows/rocm-mi300.yml @@ -6,6 +6,7 @@ on: - main - release/* tags: + - ciflow/rocm/* - ciflow/rocm-mi300/* workflow_dispatch: schedule: diff --git a/.github/workflows/slow-rocm-mi200.yml b/.github/workflows/slow-rocm-mi200.yml new file mode 100644 index 0000000000000..c564857dca9ce --- /dev/null +++ b/.github/workflows/slow-rocm-mi200.yml @@ -0,0 +1,81 @@ +# This workflow is dedicated to host slow jobs that are run only periodically because +# they are too slow to run in every commit. 
The list of slow tests can be found in +# https://github.com/pytorch/test-infra/blob/generated-stats/stats/slow-tests.json +name: slow-rocm-mi200 + +on: + push: + branches: + - release/* + tags: + - ciflow/slow/* + - ciflow/slow-rocm-mi200/* + schedule: + - cron: 0 */3 * * * + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + llm-td: + if: github.repository_owner == 'pytorch' + name: before-test + uses: ./.github/workflows/llm_td_retrieval.yml + permissions: + id-token: write + contents: read + + target-determination: + name: before-test + uses: ./.github/workflows/target_determination.yml + needs: llm-td + permissions: + id-token: write + contents: read + + get-label-type: + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + + linux-jammy-rocm-py3_10-build: + name: linux-jammy-rocm-py3.10 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-rocm-py3.10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 + sync-tag: rocm-build + test-matrix: | + { include: [ + { config: "slow", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] }, + { config: "slow", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] }, + ]} + secrets: inherit + + linux-jammy-rocm-py3_10-test: + permissions: + id-token: write + contents: read + name: linux-jammy-rocm-py3.10 + uses: ./.github/workflows/_rocm-test.yml + needs: + - linux-jammy-rocm-py3_10-build + - target-determination + with: + build-environment: linux-jammy-rocm-py3.10 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} + secrets: inherit diff --git a/.github/workflows/slow.yml b/.github/workflows/slow.yml index d4992a2ddb2cf..c14caee9a336c 100644 --- a/.github/workflows/slow.yml +++ b/.github/workflows/slow.yml @@ -105,36 +105,6 @@ jobs: test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }} secrets: inherit - linux-jammy-rocm-py3_10-build: - name: linux-jammy-rocm-py3.10 - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-rocm-py3.10 - docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 - test-matrix: | - { include: [ - { config: "slow", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] }, - { config: "slow", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] }, - ]} - secrets: inherit - - linux-jammy-rocm-py3_10-test: - permissions: - id-token: write - contents: read - name: linux-jammy-rocm-py3.10 - uses: 
./.github/workflows/_rocm-test.yml - needs: - - linux-jammy-rocm-py3_10-build - - target-determination - with: - build-environment: linux-jammy-rocm-py3.10 - docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} - secrets: inherit - linux-jammy-py3_10-clang18-asan-build: name: linux-jammy-py3.10-clang18-asan uses: ./.github/workflows/_linux-build.yml diff --git a/.github/workflows/test-b200.yml b/.github/workflows/test-b200.yml index ef7f75bc4b2b4..07fd9b18fdada 100644 --- a/.github/workflows/test-b200.yml +++ b/.github/workflows/test-b200.yml @@ -5,7 +5,9 @@ # Flow: # 1. Builds PyTorch with CUDA 12.8+ and sm100 architecture for B200 # 2. Runs smoke tests on linux.dgx.b200 runner -# 3. Tests executed are defined in .ci/pytorch/test.sh -> test_python_smoke() function +# 3. Tests executed are defined in .ci/pytorch/test.sh -> test_python_smoke_b200() function +# - Includes matmul, scaled_matmul, FP8, and FlashAttention CuTe tests +# - FlashAttention CuTe DSL is installed as part of test execution # # Triggered by: # - Pull requests modifying this workflow file diff --git a/.github/workflows/test-h100.yml b/.github/workflows/test-h100.yml index ec99f4473bb0b..510473d5306ad 100644 --- a/.github/workflows/test-h100.yml +++ b/.github/workflows/test-h100.yml @@ -41,7 +41,6 @@ jobs: needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runner: linux.12xlarge.memory build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 cuda-arch-list: '9.0' diff --git a/.github/workflows/trunk-rocm-mi300.yml b/.github/workflows/trunk-rocm-mi300.yml new file mode 100644 index 0000000000000..23ab5e9260a3e --- /dev/null +++ b/.github/workflows/trunk-rocm-mi300.yml @@ -0,0 +1,83 @@ +name: trunk-rocm-mi300 + +on: + push: + branches: + - main + - release/* + workflow_dispatch: + schedule: + - cron: 29 8 * * * # about 1:29am PDT + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + llm-td: + if: github.repository_owner == 'pytorch' + name: before-test + uses: ./.github/workflows/llm_td_retrieval.yml + permissions: + id-token: write + contents: read + + target-determination: + name: before-test + uses: ./.github/workflows/target_determination.yml + needs: llm-td + permissions: + id-token: write + contents: read + + get-label-type: + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + + linux-jammy-rocm-py3_10-build: + name: linux-jammy-rocm-py3.10 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-rocm-py3.10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 + sync-tag: rocm-build + 
test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" }, + { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" }, + { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" }, + { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" }, + { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" }, + { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" }, + { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4.b" }, + { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4.b" }, + { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4.b" }, + ]} + secrets: inherit + + linux-jammy-rocm-py3_10-test: + permissions: + id-token: write + contents: read + name: linux-jammy-rocm-py3.10 + uses: ./.github/workflows/_rocm-test.yml + needs: + - linux-jammy-rocm-py3_10-build + - target-determination + with: + build-environment: linux-jammy-rocm-py3.10 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} + secrets: inherit diff --git a/.github/workflows/upload-test-stats.yml b/.github/workflows/upload-test-stats.yml index 24c3ab3db84f3..b3d8073aad3b3 100644 --- a/.github/workflows/upload-test-stats.yml +++ b/.github/workflows/upload-test-stats.yml @@ -5,21 +5,23 @@ on: workflows: - pull - trunk + - trunk-rocm-mi300 - periodic - periodic-rocm-mi200 - periodic-rocm-mi300 - inductor - unstable - slow + - slow-rocm-mi200 - unstable-periodic - inductor-periodic - - rocm + - rocm-mi200 - rocm-mi300 - rocm-mi355 - inductor-micro-benchmark - inductor-micro-benchmark-x86 - inductor-cu124 - - inductor-rocm + - inductor-rocm-mi200 - inductor-rocm-mi300 - mac-mps - linux-aarch64 diff --git a/.github/workflows/xpu.yml b/.github/workflows/xpu.yml index 36f603f70fde7..d9a1ba13d2b59 100644 --- a/.github/workflows/xpu.yml +++ b/.github/workflows/xpu.yml @@ -47,15 +47,15 @@ jobs: ]} secrets: inherit - linux-jammy-xpu-n-py3_10-build: - name: linux-jammy-xpu-n-py3.10 + linux-noble-xpu-n-py3_10-build: + name: linux-noble-xpu-n-py3.10 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: sync-tag: linux-xpu-n-build runner_prefix: ${{ needs.get-label-type.outputs.label-type }} - build-environment: linux-jammy-xpu-n-py3.10 - docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3 + build-environment: linux-noble-xpu-n-py3.10 + docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3 runner: linux.c7i.12xlarge test-matrix: | { include: [ @@ -74,17 +74,17 @@ jobs: ]} secrets: inherit - linux-jammy-xpu-n-py3_10-test: - name: linux-jammy-xpu-n-py3.10 + linux-noble-xpu-n-py3_10-test: + name: linux-noble-xpu-n-py3.10 uses: ./.github/workflows/_xpu-test.yml - needs: linux-jammy-xpu-n-py3_10-build + needs: linux-noble-xpu-n-py3_10-build permissions: id-token: write contents: read with: - build-environment: linux-jammy-xpu-n-py3.10 - docker-image: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.test-matrix }} + build-environment: linux-noble-xpu-n-py3.10 + docker-image: ${{ needs.linux-noble-xpu-n-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-noble-xpu-n-py3_10-build.outputs.test-matrix }} secrets: inherit windows-xpu-n-1-build: diff --git 
a/.lintrunner.toml b/.lintrunner.toml index cee0249ad96eb..7a6e241f90c8d 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -143,7 +143,8 @@ init_command = [ 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', 'numpy==1.26.4 ; python_version >= "3.10" and python_version <= "3.11"', - 'numpy==2.1.0 ; python_version >= "3.12"', + 'numpy==2.1.0 ; python_version >= "3.12" and python_version <= "3.13"', + 'numpy==2.3.4 ; python_version >= "3.14"', 'expecttest==0.3.0', 'pyrefly==0.36.2', 'sympy==1.13.3', @@ -185,6 +186,8 @@ include_patterns = [ 'aten/src/ATen/native/nested/cuda/*.h', 'aten/src/ATen/native/nested/*.cpp', 'aten/src/ATen/native/nested/*.h', + 'aten/src/ATen/xpu/**/*.h', + 'aten/src/ATen/xpu/**/*.cpp', 'c10/**/*.cpp', 'c10/**/*.h', 'torch/*.h', @@ -1401,7 +1404,7 @@ init_command = [ '--dry-run={{DRYRUN}}', 'usort==1.0.8.post1', 'isort==6.0.1', - 'ruff==0.13.1', # sync with RUFF + 'ruff==0.14.4', # sync with RUFF ] is_formatter = true @@ -1536,7 +1539,7 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', - 'ruff==0.13.1', # sync with PYFMT + 'ruff==0.14.4', # sync with PYFMT ] is_formatter = true diff --git a/.spin/cmds.py b/.spin/cmds.py new file mode 100644 index 0000000000000..a81717c7423be --- /dev/null +++ b/.spin/cmds.py @@ -0,0 +1,330 @@ +import hashlib +import subprocess +import sys +from pathlib import Path + +import click +import spin + + +def file_digest(file, algorithm: str): + try: + return hashlib.file_digest(file, algorithm) + except AttributeError: + pass # Fallback to manual implementation below + hash = hashlib.new(algorithm) + while chunk := file.read(8192): + hash.update(chunk) + return hash + + +def _hash_file(file): + with open(file, "rb") as f: + hash = file_digest(f, "sha256") + return hash.hexdigest() + + +def _hash_files(files): + hashes = {file: _hash_file(file) for file in files} + return hashes + + +def _read_hashes(hash_file: Path): + if not hash_file.exists(): + return {} + with hash_file.open("r") as f: + lines = f.readlines() + hashes = {} + for line in lines: + hash = line[:64] + file = line[66:].strip() + hashes[file] = hash + return hashes + + +def _updated_hashes(hash_file, files_to_hash): + old_hashes = _read_hashes(hash_file) + new_hashes = _hash_files(files_to_hash) + if new_hashes != old_hashes: + return new_hashes + return None + + +@click.command() +def regenerate_version(): + """Regenerate version.py.""" + cmd = [ + sys.executable, + "-m", + "tools.generate_torch_version", + "--is-debug=false", + ] + spin.util.run(cmd) + + +TYPE_STUBS = [ + ( + "Pytorch type stubs", + Path(".lintbin/.pytorch-type-stubs.sha256"), + [ + "aten/src/ATen/native/native_functions.yaml", + "aten/src/ATen/native/tags.yaml", + "tools/autograd/deprecated.yaml", + ], + [ + sys.executable, + "-m", + "tools.pyi.gen_pyi", + "--native-functions-path", + "aten/src/ATen/native/native_functions.yaml", + "--tags-path", + "aten/src/ATen/native/tags.yaml", + "--deprecated-functions-path", + "tools/autograd/deprecated.yaml", + ], + ), + ( + "Datapipes type stubs", + None, + [], + [ + sys.executable, + "torch/utils/data/datapipes/gen_pyi.py", + ], + ), +] + + +@click.command() +def regenerate_type_stubs(): + """Regenerate type stubs.""" + for name, hash_file, files_to_hash, cmd in TYPE_STUBS: + if hash_file: + if hashes := _updated_hashes(hash_file, files_to_hash): + click.echo( + f"Changes detected in type stub files for {name}. Regenerating..." 
+ ) + spin.util.run(cmd) + hash_file.parent.mkdir(parents=True, exist_ok=True) + with hash_file.open("w") as f: + for file, hash in hashes.items(): + f.write(f"{hash} {file}\n") + click.echo("Type stubs and hashes updated.") + else: + click.echo(f"No changes detected in type stub files for {name}.") + else: + click.echo(f"No hash file for {name}. Regenerating...") + spin.util.run(cmd) + click.echo("Type stubs regenerated.") + + +@click.command() +def regenerate_clangtidy_files(): + """Regenerate clang-tidy files.""" + cmd = [ + sys.executable, + "-m", + "tools.linter.clang_tidy.generate_build_files", + ] + spin.util.run(cmd) + + +#: These linters are expected to need less than 3s cpu time total +VERY_FAST_LINTERS = { + "ATEN_CPU_GPU_AGNOSTIC", + "BAZEL_LINTER", + "C10_NODISCARD", + "C10_UNUSED", + "CALL_ONCE", + "CMAKE_MINIMUM_REQUIRED", + "CONTEXT_DECORATOR", + "COPYRIGHT", + "CUBINCLUDE", + "DEPLOY_DETECTION", + "ERROR_PRONE_ISINSTANCE", + "EXEC", + "HEADER_ONLY_LINTER", + "IMPORT_LINTER", + "INCLUDE", + "LINTRUNNER_VERSION", + "MERGE_CONFLICTLESS_CSV", + "META_NO_CREATE_UNBACKED", + "NEWLINE", + "NOQA", + "NO_WORKFLOWS_ON_FORK", + "ONCE_FLAG", + "PYBIND11_INCLUDE", + "PYBIND11_SPECIALIZATION", + "PYPIDEP", + "PYPROJECT", + "RAWCUDA", + "RAWCUDADEVICE", + "ROOT_LOGGING", + "TABS", + "TESTOWNERS", + "TYPEIGNORE", + "TYPENOSKIP", + "WORKFLOWSYNC", +} + + +#: These linters are expected to take a few seconds, but less than 10s cpu time total +FAST_LINTERS = { + "CMAKE", + "DOCSTRING_LINTER", + "GHA", + "NATIVEFUNCTIONS", + "RUFF", + "SET_LINTER", + "SHELLCHECK", + "SPACES", +} + + +#: These linters are expected to take more than 10s cpu time total; +#: some need more than 1 hour. +SLOW_LINTERS = { + "ACTIONLINT", + "CLANGFORMAT", + "CLANGTIDY", + "CODESPELL", + "FLAKE8", + "GB_REGISTRY", + "PYFMT", + "PYREFLY", + "TEST_DEVICE_BIAS", + "TEST_HAS_MAIN", +} + + +ALL_LINTERS = VERY_FAST_LINTERS | FAST_LINTERS | SLOW_LINTERS + + +LINTRUNNER_CACHE_INFO = ( + Path(".lintbin/.lintrunner.sha256"), + [ + "requirements.txt", + "pyproject.toml", + ".lintrunner.toml", + ], +) + + +LINTRUNNER_BASE_CMD = [ + "uvx", + "--python", + "3.10", + "lintrunner@0.12.7", +] + + +@click.command() +def setup_lint(): + """Set up lintrunner with current CI version.""" + cmd = LINTRUNNER_BASE_CMD + ["init"] + subprocess.run(cmd, check=True, capture_output=True, text=True) + + +def _check_linters(): + cmd = LINTRUNNER_BASE_CMD + ["list"] + ret = spin.util.run(cmd, output=False, stderr=subprocess.PIPE) + linters = {l.strip() for l in ret.stdout.decode().strip().split("\n")[1:]} + unknown_linters = linters - ALL_LINTERS + missing_linters = ALL_LINTERS - linters + if unknown_linters: + click.secho( + f"Unknown linters found; please add them to the correct category " + f"in .spin/cmds.py: {', '.join(unknown_linters)}", + fg="yellow", + ) + if missing_linters: + click.secho( + f"Missing linters found; please update the corresponding category " + f"in .spin/cmds.py: {', '.join(missing_linters)}", + fg="yellow", + ) + return unknown_linters, missing_linters + + +@spin.util.extend_command( + setup_lint, + doc=f""" + If configuration has changed, update lintrunner. + + Compares the stored old hashes of configuration files with new ones and + performs setup via setup-lint if the hashes have changed. + Hashes are stored in {LINTRUNNER_CACHE_INFO[0]}; the following files are + considered: {", ".join(LINTRUNNER_CACHE_INFO[1])}. 
+ """, +) +@click.pass_context +def lazy_setup_lint(ctx, parent_callback, **kwargs): + if hashes := _updated_hashes(*LINTRUNNER_CACHE_INFO): + click.echo( + "Changes detected in lint configuration files. Setting up linting tools..." + ) + parent_callback(**kwargs) + hash_file = LINTRUNNER_CACHE_INFO[0] + hash_file.parent.mkdir(parents=True, exist_ok=True) + with hash_file.open("w") as f: + for file, hash in hashes.items(): + f.write(f"{hash} {file}\n") + click.echo("Linting tools set up and hashes updated.") + else: + click.echo("No changes detected in lint configuration files. Skipping setup.") + click.echo("Regenerating version...") + ctx.invoke(regenerate_version) + click.echo("Regenerating type stubs...") + ctx.invoke(regenerate_type_stubs) + click.echo("Done.") + _check_linters() + + +@click.command() +@click.option("-a", "--apply-patches", is_flag=True) +@click.pass_context +def lint(ctx, apply_patches, **kwargs): + """Lint all files.""" + ctx.invoke(lazy_setup_lint) + all_files_linters = VERY_FAST_LINTERS | FAST_LINTERS + changed_files_linters = SLOW_LINTERS + cmd = LINTRUNNER_BASE_CMD + if apply_patches: + cmd += ["--apply-patches"] + all_files_cmd = cmd + [ + "--take", + ",".join(all_files_linters), + "--all-files", + ] + spin.util.run(all_files_cmd) + changed_files_cmd = cmd + [ + "--take", + ",".join(changed_files_linters), + ] + spin.util.run(changed_files_cmd) + + +@click.command() +@click.pass_context +def fixlint(ctx, **kwargs): + """Autofix all files.""" + ctx.invoke(lint, apply_patches=True) + + +@click.command() +@click.option("-a", "--apply-patches", is_flag=True) +@click.pass_context +def quicklint(ctx, apply_patches, **kwargs): + """Lint changed files.""" + ctx.invoke(lazy_setup_lint) + cmd = LINTRUNNER_BASE_CMD + if apply_patches: + cmd += ["--apply-patches"] + spin.util.run(cmd) + + +@click.command() +@click.pass_context +def quickfix(ctx, **kwargs): + """Autofix changed files.""" + ctx.invoke(quicklint, apply_patches=True) diff --git a/CMakeLists.txt b/CMakeLists.txt index ca1e4164be9b8..0e020abda3925 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -234,7 +234,17 @@ option(USE_COLORIZE_OUTPUT "Colorize output during compilation" ON) option(USE_ASAN "Use Address+Undefined Sanitizers" OFF) option(USE_LSAN "Use Leak Sanitizer" OFF) option(USE_TSAN "Use Thread Sanitizer" OFF) + +# Track whether USE_CUDA was explicitly set by the user (before option() is called) +# If USE_CUDA is already defined in cache, it means user explicitly set it +if(DEFINED CACHE{USE_CUDA}) + set(_USE_CUDA_EXPLICITLY_SET TRUE) +else() + set(_USE_CUDA_EXPLICITLY_SET FALSE) +endif() + option(USE_CUDA "Use CUDA" ON) + option(USE_XPU "Use XPU" ON) cmake_dependent_option( BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON @@ -726,6 +736,44 @@ if(NOT DEFINED USE_BLAS) set(USE_BLAS ON) endif() +# Prioritized Text Linker Optimization +if(USE_PRIORITIZED_TEXT_FOR_LD) + + set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt") + set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld") + + execute_process( + COMMAND ${Python_EXECUTABLE} + ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py + --filein "${LINKER_SCRIPT_FILE_IN}" + --fout "${LINKER_SCRIPT_FILE_OUT}" + RESULT_VARIABLE _gen_result + OUTPUT_VARIABLE _gen_output + ERROR_VARIABLE _gen_error + ) + + if(NOT _gen_result EQUAL 0) + message(FATAL_ERROR + "Failed to generate linker script:\n${_gen_output}\n${_gen_error}") + endif() + + 
append_cxx_flag_if_supported("-ffunction-sections" CMAKE_CXX_FLAGS) + append_cxx_flag_if_supported("-fdata-sections" CMAKE_CXX_FLAGS) + append_c_flag_if_supported("-ffunction-sections" CMAKE_C_FLAGS) + append_c_flag_if_supported("-fdata-sections" CMAKE_C_FLAGS) + + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -T${LINKER_SCRIPT_FILE_OUT}") + set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -T${LINKER_SCRIPT_FILE_OUT}") + +else() + if(LINUX AND CPU_AARCH64) + message(WARNING [[ + It is strongly recommend to enable linker script optimization for all AArch64 Linux builds. + To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1 + ]]) + endif() +endif() + # Build libtorch mobile library, which contains ATen/TH ops and native support # for TorchScript model, but doesn't contain not-yet-unified caffe2 ops; if(INTERN_BUILD_MOBILE) @@ -1392,9 +1440,6 @@ if(BUILD_JNI) add_subdirectory(android/pytorch_android) endif() -include(cmake/Summary.cmake) -caffe2_print_configuration_summary() - # Parse custom debug info if(DEFINED USE_CUSTOM_DEBINFO) string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}") @@ -1434,56 +1479,5 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA) DESTINATION "${CMAKE_INSTALL_BINDIR}") endif() -if(USE_PRIORITIZED_TEXT_FOR_LD) - add_compile_options( - $<$:-ffunction-sections> - $<$:-fdata-sections> - ) - set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld") - set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt") - - add_custom_command( - OUTPUT "${LINKER_SCRIPT_FILE_OUT}" - COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}" - DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}" - COMMENT "Generating prioritized text linker files" - VERBATIM - ) - - add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}") - - if(BUILD_PYTHON) - set(LINKER_OPT_TARGETS torch_python) - endif() - - if(NOT BUILD_LIBTORCHLESS) - list(APPEND LINKER_OPT_TARGETS torch_cpu c10) - if(USE_CUDA) - list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda) - endif() - if(USE_XPU) - list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu) - endif() - if(USE_ROCM) - list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip) - endif() - endif() - - foreach(tgt IN LISTS LINKER_OPT_TARGETS) - if(TARGET ${tgt}) - add_dependencies("${tgt}" generate_linker_script) - target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}") - set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}") - else() - message(WARNING "Requested target '${tgt}' for linker script optimization was not found.") - endif() - endforeach() - -else() - if(LINUX AND CPU_AARCH64) - message(WARNING [[ - It is strongly recommend to enable linker script optimization for all AArch64 Linux builds. 
- To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1 - ]]) - endif() -endif() +include(cmake/Summary.cmake) +caffe2_print_configuration_summary() diff --git a/CODEOWNERS b/CODEOWNERS index cc249dc4f43a2..137031066090e 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -210,8 +210,12 @@ torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A /test/inductor/test_flex_attention.py @drisspg /test/inductor/test_flex_decoding.py @drisspg -# Low Precision GEMMs +# Low Precision & Grouped GEMMs /aten/src/ATen/native/cuda/Blas.cpp @drisspg @slayton58 +/aten/src/ATen/native/cuda/GroupedBlas.cpp @drisspg @slayton58 +/aten/src/ATen/native/cuda/ScaledBlas.cpp @drisspg @slayton58 /aten/src/ATen/cuda/CUDABlas.cpp @drisspg @slayton58 /aten/src/ATen/cuda/CUDABlas.h @drisspg @slayton58 +/aten/src/ATen/cuda/CUDAScaledBlas.cpp @drisspg @slayton58 +/aten/src/ATen/cuda/CUDAScaledBlas.h @drisspg @slayton58 /test/test_scaled_matmul_cuda.py @drisspg @slayton58 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9df55ca6acd5c..bc0b0fc9bb00f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -18,7 +18,7 @@ aspects of contributing to PyTorch. - [Python Unit Testing](#python-unit-testing) - [Better local unit tests with `pytest`](#better-local-unit-tests-with-pytest) - [Local linting](#local-linting) - - [Running `mypy`](#running-mypy) + - [Running `pyrefly`](#running-pyrefly) - [C++ Unit Testing](#c-unit-testing) - [Run Specific CI Jobs](#run-specific-ci-jobs) - [Merging your Change](#merging-your-change) @@ -281,7 +281,7 @@ dependencies as well as the nightly binaries into the repo directory. **Prerequisites**: The following packages should be installed with `pip`: - `expecttest` and `hypothesis` - required to run tests -- `mypy` - recommended for linting +- `pyrefly` - recommended for type checking. [Pyrefly](https://pyrefly.org/) - `pytest` - recommended to run tests more selectively Running ``` @@ -350,15 +350,32 @@ make lint Learn more about the linter on the [lintrunner wiki page](https://github.com/pytorch/pytorch/wiki/lintrunner) -#### Running `mypy` +#### Running `pyrefly` -`mypy` is an optional static type checker for Python. We have multiple `mypy` -configs for the PyTorch codebase that are automatically validated against whenever the linter is run. +[Pyrefly](https://pyrefly.org/) is a high-performance static type checker for Python. It provides fast type checking along with IDE features like autocomplete and instant error feedback. + +PyTorch uses Pyrefly for type checking across the codebase. The configuration is managed in `pyrefly.toml` at the root of the repository. + +**Getting Started with Pyrefly:** + +To run type checking on the PyTorch codebase: +```bash +pyrefly check +``` + +For more detailed error information with summaries: +```bash +pyrefly check --summarize-errors +``` + +**Learn More:** +- [Pyrefly Configuration](https://pyrefly.org/en/docs/configuration/) - Detailed configuration options +- [Pyrefly IDE Features](https://pyrefly.org/en/docs/IDE-features/) - Set up Pyrefly in your editor for real-time type checking +- [Python Typing Tutorial](https://pyrefly.org/en/docs/typing-for-python-developers/) - Learn about Python type annotations See [Guide for adding type annotations to PyTorch](https://github.com/pytorch/pytorch/wiki/Guide-for-adding-type-annotations-to-PyTorch) -for more information on how to set up `mypy` and tackle type annotation -tasks. +for PyTorch-specific guidance on how to set up `pyrefly` and tackle type annotation tasks in this codebase. 
### C++ Unit Testing diff --git a/LICENSE b/LICENSE index 966a609b61e53..c23172f7aff02 100644 --- a/LICENSE +++ b/LICENSE @@ -37,7 +37,7 @@ Copyright (c) 2024 Tri Dao. All rights reserved. All contributions by Arm: -Copyright (c) 2021, 2023-2024 Arm Limited and/or its affiliates +Copyright (c) 2021, 2023-2025 Arm Limited and/or its affiliates All contributions from Caffe: Copyright(c) 2013, 2014, 2015, the respective contributors diff --git a/SECURITY.md b/SECURITY.md index ed8228af36724..2d2c8a0c5f1c5 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,7 +1,7 @@ # Security Policy - [**Reporting a Vulnerability**](#reporting-a-vulnerability) - - [**Using Pytorch Securely**](#using-pytorch-securely) + - [**Using PyTorch Securely**](#using-pytorch-securely) - [Untrusted models](#untrusted-models) - [TorchScript models](#torchscript-models) - [Untrusted inputs](#untrusted-inputs) @@ -10,28 +10,30 @@ - [**CI/CD security principles**](#cicd-security-principles) ## Reporting Security Issues -Beware that none of the topics under [Using Pytorch Securely](#using-pytorch-securely) are considered vulnerabilities of Pytorch. +Beware that none of the topics under [Using PyTorch Securely](#using-pytorch-securely) are considered vulnerabilities of PyTorch. However, if you believe you have found a security vulnerability in PyTorch, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. Please report security issues using https://github.com/pytorch/pytorch/security/advisories/new -All reports submitted thru the security advisories mechanism would **either be made public or dismissed by the team within 90 days of the submission**. If advisory has been closed on the grounds that it is not a security issue, please do not hesitate to create an [new issue](https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml) as it is still likely a valid issue within the framework. +All reports submitted through the security advisories mechanism would **either be made public or dismissed by the team within 90 days of the submission**. If the advisory has been closed on the grounds that it is not a security issue, please do not hesitate to create a [new issue](https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml) as it is still likely a valid issue within the framework. + +**Note on crashes and out of bounds access**: PyTorch is a computational framework that performs operations on behalf of the caller. Like many low-level libraries, PyTorch generally does not validate all inputs to every function—the responsibility for providing valid arguments lies with the calling code. While crashes and out of bounds memory access should be reported as bugs, they are generally not considered security vulnerabilities in PyTorch's threat model. Please refer to the following page for our responsible disclosure policy, reward guidelines, and those things that should not be reported: https://www.facebook.com/whitehat -## Using Pytorch Securely -**Pytorch models are programs**, so treat its security seriously -- running untrusted models is equivalent to running untrusted code. In general we recommend that model weights and the python code for the model are distributed independently. That said, be careful about where you get the python code from and who wrote it (preferentially check for a provenance or checksums, do not run any pip installed package).
+## Using PyTorch Securely +**PyTorch models are programs**, so treat its security seriously -- running untrusted models is equivalent to running untrusted code. In general we recommend that model weights and the python code for the model are distributed independently. That said, be careful about where you get the python code from and who wrote it (preferentially check for a provenance or checksums, do not run any pip installed package). ### Untrusted models Be careful when running untrusted models. This classification includes models created by unknown developers or utilizing data obtained from unknown sources[^data-poisoning-sources]. **Prefer to execute untrusted models within a secure, isolated environment such as a sandbox** (e.g., containers, virtual machines). This helps protect your system from potentially malicious code. You can find further details and instructions in [this page](https://developers.google.com/code-sandboxing). -**Be mindful of risky model formats**. Give preference to share and load weights with the appropriate format for your use case. [safetensors](https://huggingface.co/docs/safetensors/en/index) gives the most safety but is the most restricted in what it supports. [`torch.load`](https://pytorch.org/docs/stable/generated/torch.load.html#torch.load) has a significantly larger surface of attack but is more flexible in what it can serialize. See the documentation for more details. +**Be mindful of risky model formats**. Give preference to share and load weights with the appropriate format for your use case. [Safetensors](https://huggingface.co/docs/safetensors/en/index) gives the most safety but is the most restricted in what it supports. [`torch.load`](https://pytorch.org/docs/stable/generated/torch.load.html#torch.load) has a significantly larger surface of attack but is more flexible in what it can serialize. See the documentation for more details. Even for more secure serialization formats, unexpected inputs to the downstream system can cause diverse security threats (e.g. denial of service, out of bound reads/writes) and thus we recommend extensive validation of any untrusted inputs. @@ -43,7 +45,7 @@ Important Note: The trustworthiness of a model is not binary. You must always de ### TorchScript models -TorchScript models should treated the same way as locally executable code from an unknown source. Only run TorchScript models if you trust the provider. Please note, that tools for introspecting TorchScript models (such as `torch.utils.model_dump`) may also execute partial or full code stored in those models, therefore they should be used only if you trust the provider of the binary you are about to load. +TorchScript models should be treated the same way as locally executable code from an unknown source. Only run TorchScript models if you trust the provider. Please note, that tools for introspecting TorchScript models (such as `torch.utils.model_dump`) may also execute partial or full code stored in those models, therefore they should be used only if you trust the provider of the binary you are about to load. ### Untrusted inputs during training and prediction @@ -59,9 +61,9 @@ If applicable, prepare your model against bad inputs and prompt injections. Some ### Data privacy -**Take special security measures if your model if you train models with sensitive data**. 
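The "Be mindful of risky model formats" guidance above can be made concrete with a short sketch; the file names are placeholders and the snippet assumes the separate `safetensors` package is installed:

```python
# Sketch of the loading guidance above; paths are placeholders.
import torch
from safetensors.torch import load_file  # assumes the `safetensors` package is installed

weights = load_file("model.safetensors")                 # tensors only, no arbitrary code execution
weights_pt = torch.load("model.pt", weights_only=True)   # restricts unpickling to tensor/container types
```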
Prioritize [sandboxing](https://developers.google.com/code-sandboxing) your models and: -- Do not feed sensitive data to untrusted model (even if runs in a sandboxed environment) -- If you consider publishing a model that was partially trained with sensitive data, be aware that data can potentially be recovered from the trained weights (especially if model overfits). +**Take special security measures if you train your models with sensitive data**. Prioritize [sandboxing](https://developers.google.com/code-sandboxing) your models and: +- Do not feed sensitive data to an untrusted model (even if runs in a sandboxed environment) +- If you consider publishing a model that was partially trained with sensitive data, be aware that data can potentially be recovered from the trained weights (especially if the model overfits). ### Using distributed features diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 8b283c417b74b..ae762e1def3ec 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -260,7 +260,7 @@ IF(USE_FBGEMM_GENAI) if(USE_CUDA) # To avoid increasing the build time/binary size unnecessarily, use an allow-list of kernels to build. # If you want to integrate a kernel from FBGEMM into torch, you have to add it here. - set(FBGEMM_CUTLASS_KERNELS_REGEX ".*(mx8mx8bf16_grouped|f4f4bf16_grouped).*") + set(FBGEMM_CUTLASS_KERNELS_REGEX ".*(mx8mx8bf16_grouped|f4f4bf16_grouped|f4f4bf16).*") file(GLOB_RECURSE fbgemm_genai_native_cuda_cu "${FBGEMM_GENAI_SRCS}/cutlass_extensions/*.cu" "${FBGEMM_GENAI_SRCS}/cutlass_extensions/**/*.cu") diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index a354b41912406..6bc321887502d 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -23,8 +23,6 @@ C10_DIAGNOSTIC_POP() #endif namespace at { -namespace { - /* These const variables defined the fp32 precisions for different backend We have "generic", "cuda", "mkldnn" backend now and we can choose fp32 @@ -41,16 +39,6 @@ namespace { ->rnn */ - C10_ALWAYS_INLINE void warn_deprecated_fp32_precision_api(){ - TORCH_WARN_ONCE( - "Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' " - "or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, " - "torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see " - "https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices" - ); - } -} // namespace - Float32Backend str2backend(const std::string& name) { if (name == "generic") return Float32Backend::GENERIC; @@ -206,7 +194,6 @@ bool Context::allowTF32CuDNN(std::optional op) const { } else { return float32Precision(Float32Backend::CUDA, op.value()) == Float32Precision::TF32; } - warn_deprecated_fp32_precision_api(); return allow_tf32_cudnn; } @@ -214,7 +201,6 @@ void Context::setAllowTF32CuDNN(bool b) { setFloat32Precision(Float32Backend::CUDA, Float32Op::RNN, b ? Float32Precision::TF32 : Float32Precision::NONE); setFloat32Precision(Float32Backend::CUDA, Float32Op::CONV, b ? 
Float32Precision::TF32 : Float32Precision::NONE); allow_tf32_cudnn = b; - warn_deprecated_fp32_precision_api(); } void Context::setSDPPriorityOrder(const std::vector& order) { @@ -325,7 +311,6 @@ bool Context::allowTF32CuBLAS() const { "Current status indicate that you have used mix of the legacy and new APIs to set the TF32 status for cublas matmul. ", "We suggest only using the new API to set the TF32 flag. See also: ", "https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices"); - warn_deprecated_fp32_precision_api(); return allow_tf32_new; } @@ -349,7 +334,6 @@ Float32MatmulPrecision Context::float32MatmulPrecision() const { "Current status indicate that you have used mix of the legacy and new APIs to set the matmul precision. ", "We suggest only using the new API for matmul precision. See also: ", "https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices"); - warn_deprecated_fp32_precision_api(); return float32_matmul_precision; } @@ -377,7 +361,6 @@ Float32Precision Context::float32Precision(Float32Backend backend, Float32Op op) void Context::setFloat32MatmulPrecision(const std::string &s) { auto match = [this](const std::string & s_) { - warn_deprecated_fp32_precision_api(); // TODO: consider if CuDNN field needs to also be set for potential future CuDNN ops like multi-headed attention if (s_ == "highest") { float32_matmul_precision = at::Float32MatmulPrecision::HIGHEST; diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 6807e527eb75f..385ccb88c463b 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -174,6 +174,12 @@ class TORCH_API Context { static long versionCuDNN() { return detail::getCUDAHooks().versionCuDNN(); } + static long versionRuntimeCuDNN() { + return detail::getCUDAHooks().versionRuntimeCuDNN(); + } + static long versionCuDNNFrontend() { + return detail::getCUDAHooks().versionCuDNNFrontend(); + } static bool hasCuSOLVER() { return detail::getCUDAHooks().hasCuSOLVER(); } diff --git a/aten/src/ATen/DeviceAccelerator.h b/aten/src/ATen/DeviceAccelerator.h index f23b35047fcc8..2cc4cff7cd1f2 100644 --- a/aten/src/ATen/DeviceAccelerator.h +++ b/aten/src/ATen/DeviceAccelerator.h @@ -94,6 +94,11 @@ TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) { at::getDeviceAllocator(device_type)->resetPeakStats(device_index); } +TORCH_API inline std::pair getMemoryInfo( + c10::DeviceIndex device_index) { + const auto device_type = getAccelerator(true).value(); + return at::getDeviceAllocator(device_type)->getMemoryInfo(device_index); +} } // namespace at::accelerator namespace at { diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h index 40ad61cbd6455..870f7172d1622 100644 --- a/aten/src/ATen/Dispatch.h +++ b/aten/src/ATen/Dispatch.h @@ -6,6 +6,7 @@ #include #include #include +#include #ifdef __CUDACC__ #include // For CUDA_VERSION @@ -61,12 +62,9 @@ TORCH_API void record_kernel_function_dtype(std::string name); } \ } while (0) -#define AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, HINT, ...) \ - case enum_type: { \ - AT_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type); \ - using HINT [[maybe_unused]] = c10::impl::ScalarTypeToCPPTypeT; \ - return __VA_ARGS__(); \ - } +#define AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, HINT, ...) \ + THO_PRIVATE_CASE_TYPE_USING_HINT_TMPL( \ + AT_PRIVATE_CHECK_SELECTIVE_BUILD, enum_type, HINT, __VA_ARGS__) #define AT_DISPATCH_CASE(enum_type, ...) 
\ AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, scalar_t, __VA_ARGS__) @@ -95,14 +93,6 @@ TORCH_API void record_kernel_function_dtype(std::string name); return __VA_ARGS__(); \ } -namespace detail { - -inline at::ScalarType scalar_type(at::ScalarType s) { - return s; -} - -} // namespace detail - // The AT_DISPATCH_* family of macros provides the ability to // conveniently generate specializations of a kernel over all of the // dtypes we care about in PyTorch. We call it "dispatch" because @@ -190,27 +180,13 @@ inline at::ScalarType scalar_type(at::ScalarType s) { // but we're just being safe (and it doesn't hurt.) Note we must // use it to shut up warnings about unused store. -#define AT_DISPATCH_SWITCH(TYPE, NAME, ...) \ - [&] { \ - const auto& the_type = TYPE; \ - constexpr const char* at_dispatch_name = NAME; \ - /* don't use TYPE again in case it is an expensive or side-effect op */ \ - at::ScalarType _st = ::detail::scalar_type(the_type); \ - RECORD_KERNEL_FUNCTION_DTYPE(at_dispatch_name, _st); \ - C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum") \ - switch (_st) { \ - __VA_ARGS__ \ - default: \ - TORCH_CHECK_NOT_IMPLEMENTED( \ - false, \ - '"', \ - at_dispatch_name, \ - "\" not implemented for '", \ - toString(_st), \ - "'"); \ - } \ - C10_DIAGNOSTIC_POP() \ - }() +#define AT_DISPATCH_SWITCH(TYPE, NAME, ...) \ + THO_DISPATCH_SWITCH_TMPL( \ + RECORD_KERNEL_FUNCTION_DTYPE, \ + TORCH_CHECK_NOT_IMPLEMENTED, \ + TYPE, \ + NAME, \ + __VA_ARGS__) #define AT_DISPATCH_CASE_FLOATING_TYPES(...) \ AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \ diff --git a/aten/src/ATen/Dispatch_v2.h b/aten/src/ATen/Dispatch_v2.h index d0b77220faef2..fbeb48d45e32a 100644 --- a/aten/src/ATen/Dispatch_v2.h +++ b/aten/src/ATen/Dispatch_v2.h @@ -1,3 +1,8 @@ +#pragma once + +#include + +// Get AT_DISPATCH_SWITCH and AT_DISPATCH_CASE: #include // This is a new implementation of the AT_DISPATCH macro family from @@ -74,41 +79,19 @@ // macro expansion occurs, mediated with AT_EXPAND and AT_GUARD. I mostly // relied on GPT4 to help me get it right. -// Public API macros - // See documentation above #define AT_DISPATCH_V2(TYPE, NAME, BODY, ...) \ - AT_DISPATCH_SWITCH(TYPE, NAME, AT_AP_VAR(AT_WRAP(BODY), TYPE, __VA_ARGS__)) - -// This macro lets you pass an arbitrary expression that may contain internal -// commas to another macro without having the commas causing the expression -// to be interpreted as being multiple arguments -#define AT_WRAP(...) __VA_ARGS__ - -#define AT_FLOAT8_TYPES \ - c10::kFloat8_e5m2, c10::kFloat8_e5m2fnuz, c10::kFloat8_e4m3fn, \ - c10::kFloat8_e4m3fnuz, c10::kFloat8_e8m0fnu - -#define AT_INTEGRAL_TYPES \ - c10::kByte, c10::kChar, c10::kInt, c10::kLong, c10::kShort -#define AT_FLOATING_TYPES c10::kDouble, c10::kFloat -#define AT_BAREBONES_UNSIGNED_TYPES c10::kUInt16, c10::kUInt32, c10::kUInt64 -#define AT_INTEGRAL_TYPES_V2 \ - AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES) -#define AT_COMPLEX_TYPES c10::kComplexDouble, c10::kComplexFloat -#define AT_QINT_TYPES c10::kQInt8, c10::kQUInt8, c10::kQInt32 -// NB: not *actually* all types -#define AT_ALL_TYPES AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_FLOATING_TYPES) -#define AT_ALL_TYPES_AND_COMPLEX \ - AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_COMPLEX_TYPES) - -// Helper macros - + THO_DISPATCH_V2_TMPL( \ + AT_DISPATCH_SWITCH, \ + AT_DISPATCH_CASE, \ + TYPE, \ + NAME, \ + AT_WRAP(BODY), \ + __VA_ARGS__) + +// Unused helper macros, kept for BC: #define AT_AP_VAR(N, T, ...) 
\ AT_EXPAND(AT_CONCAT(AT_AP, AT_NUM_ARGS(__VA_ARGS__))(AT_WRAP(N), __VA_ARGS__)) -#define AT_CONCAT(a, b) AT_CONCAT_AUX(a, b) -#define AT_CONCAT_AUX(a, b) a##b -#define AT_EXPAND(X) X // Ensure we never have too many scalar types for the expansion here to // support. To bump this, you must regenerate the macros below. @@ -119,12 +102,6 @@ static_assert(static_cast(c10::ScalarType::NumOptions) < 60); num_args = 60 -nums = ', '.join(str(i) for i in reversed(range(num_args+1))) -args = ', '.join(f'_{i}' for i in range(1, num_args+1)) - -print(f'#define AT_NUM_ARGS(...) AT_EXPAND(AT_NUM_ARGS_AUX(__VA_ARGS__, {nums}))') -print(f'#define AT_NUM_ARGS_AUX({args}, N, ...) N') - for i in range(1, num_args+1): args = ', '.join(f'_{i}' for i in range(1, i+1)) cases = ' '.join([f'AT_DISPATCH_CASE(_{j}, N)' for j in range(1, i+1)]) @@ -135,8 +112,6 @@ for i in range(1, num_args+1): // Begin generated code // clang-format off -#define AT_NUM_ARGS(...) AT_EXPAND(AT_NUM_ARGS_AUX(__VA_ARGS__, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)) -#define AT_NUM_ARGS_AUX(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, _51, _52, _53, _54, _55, _56, _57, _58, _59, _60, N, ...) N #define AT_AP1(N, _1) AT_DISPATCH_CASE(_1, N) #define AT_AP2(N, _1, _2) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) #define AT_AP3(N, _1, _2, _3) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) diff --git a/aten/src/ATen/core/CachingHostAllocator.h b/aten/src/ATen/core/CachingHostAllocator.h index 603e7e73bc1ea..71af40c5fd20a 100644 --- a/aten/src/ATen/core/CachingHostAllocator.h +++ b/aten/src/ATen/core/CachingHostAllocator.h @@ -226,8 +226,8 @@ template < typename B = HostBlock> struct CachingHostAllocatorImpl { virtual ~CachingHostAllocatorImpl() { - active_ = false; - if (pinned_use_background_threads()) { + if (active_) { + active_ = false; getBackgroundThreadPool()->waitWorkComplete(); } } @@ -260,6 +260,7 @@ struct CachingHostAllocatorImpl { if (pinned_use_background_threads()) { // Launch the background thread and process events in a loop. static bool background_thread_flag [[maybe_unused]] = [this] { + active_ = true; getBackgroundThreadPool()->run([&]() { while (active_) { process_events(); @@ -683,9 +684,9 @@ struct CachingHostAllocatorImpl { alignas(hardware_destructive_interference_size) std::mutex events_mutex_; std::deque> events_; // event queue paired with block - // Indicates whether the object is active. + // Indicates whether the event-processing thread pool is active. // Set to false in the destructor to signal background threads to stop. 
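The `active_` change above only matters when background event processing is enabled for the pinned-memory host allocator. A rough sketch of how that option is turned on from Python follows; the config key mirrors the `pinned_use_background_threads()` accessor in this hunk, and the exact env-var syntax is an assumption:

```python
# Sketch: enable background event processing for the caching host allocator.
# Key name taken from pinned_use_background_threads(); treat the syntax as an assumption.
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "pinned_use_background_threads:True"

import torch  # set the env var before the allocator is initialized

x = torch.empty(1 << 20, pin_memory=True)   # served by the caching host allocator
y = x.to("cuda", non_blocking=True)         # async copy out of pinned memory
torch.cuda.synchronize()
```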
- std::atomic active_{true}; + std::atomic active_{false}; protected: alignas(hardware_destructive_interference_size) HostStatsStaged stats_; }; diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index f13b0613691b4..73aed03da073d 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -18,6 +18,8 @@ #include #include +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default") + namespace torch { class TORCH_API CustomClassHolder : public c10::intrusive_ptr_target {}; namespace jit { @@ -1630,4 +1632,6 @@ struct TORCH_API WeakOrStrongTypePtr { } // namespace c10 +C10_DIAGNOSTIC_POP() + #include // IWYU pragma: keep diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index 8d1c3aa83dadb..ac7540cffd18f 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -29,6 +29,8 @@ #include #include +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default") + namespace torch { namespace jit { struct Function; @@ -2567,3 +2569,5 @@ TypePtr IValue::type() const { } } // namespace c10 + +C10_DIAGNOSTIC_POP() diff --git a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h index 9e0b189bdac89..757ef839f965a 100644 --- a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h +++ b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h @@ -191,7 +191,7 @@ class Vectorized { auto vals = svreinterpret_u16_bf16(values); vals = sveor_u16_x(ptrue, vals, mask); return svreinterpret_bf16_u16(vals); - }; + } Vectorized round() const; Vectorized tan() const; Vectorized tanh() const; @@ -349,47 +349,47 @@ Vectorized inline Vectorized::frac() const { return convert_float_bfloat16(v1, v2); \ } -DEFINE_BF16_FUNC_VIA_FLOAT(isnan); -DEFINE_BF16_FUNC_VIA_FLOAT(angle); -DEFINE_BF16_FUNC_VIA_FLOAT(acos); -DEFINE_BF16_FUNC_VIA_FLOAT(acosh); -DEFINE_BF16_FUNC_VIA_FLOAT(asin); -DEFINE_BF16_FUNC_VIA_FLOAT(atan); -DEFINE_BF16_FUNC_VIA_FLOAT(atanh); -DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(atan2); -DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(copysign); -DEFINE_BF16_FUNC_VIA_FLOAT(erf); -DEFINE_BF16_FUNC_VIA_FLOAT(erfc); -DEFINE_BF16_FUNC_VIA_FLOAT(exp); -DEFINE_BF16_FUNC_VIA_FLOAT(exp2); -DEFINE_BF16_FUNC_VIA_FLOAT(expm1); -DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(fmod); -DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(hypot); -DEFINE_BF16_FUNC_VIA_FLOAT(i0); -DEFINE_BF16_FUNC_VIA_FLOAT(i0e); -DEFINE_BF16_FUNC_VIA_FLOAT(digamma); -DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(igamma); -DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(igammac); -DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(nextafter); -DEFINE_BF16_FUNC_VIA_FLOAT(log); -DEFINE_BF16_FUNC_VIA_FLOAT(log2); -DEFINE_BF16_FUNC_VIA_FLOAT(log10); -DEFINE_BF16_FUNC_VIA_FLOAT(log1p); -DEFINE_BF16_FUNC_VIA_FLOAT(sin); -DEFINE_BF16_FUNC_VIA_FLOAT(sinh); -DEFINE_BF16_FUNC_VIA_FLOAT(cos); -DEFINE_BF16_FUNC_VIA_FLOAT(cosh); -DEFINE_BF16_FUNC_VIA_FLOAT(ceil); -DEFINE_BF16_FUNC_VIA_FLOAT(floor); -DEFINE_BF16_FUNC_VIA_FLOAT(round); -DEFINE_BF16_FUNC_VIA_FLOAT(tan); -DEFINE_BF16_FUNC_VIA_FLOAT(tanh); -DEFINE_BF16_FUNC_VIA_FLOAT(trunc); -DEFINE_BF16_FUNC_VIA_FLOAT(lgamma); -DEFINE_BF16_FUNC_VIA_FLOAT(sqrt); -DEFINE_BF16_FUNC_VIA_FLOAT(reciprocal); -DEFINE_BF16_FUNC_VIA_FLOAT(rsqrt); -DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(pow); +DEFINE_BF16_FUNC_VIA_FLOAT(isnan) +DEFINE_BF16_FUNC_VIA_FLOAT(angle) +DEFINE_BF16_FUNC_VIA_FLOAT(acos) +DEFINE_BF16_FUNC_VIA_FLOAT(acosh) +DEFINE_BF16_FUNC_VIA_FLOAT(asin) +DEFINE_BF16_FUNC_VIA_FLOAT(atan) +DEFINE_BF16_FUNC_VIA_FLOAT(atanh) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(atan2) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(copysign) 
+DEFINE_BF16_FUNC_VIA_FLOAT(erf) +DEFINE_BF16_FUNC_VIA_FLOAT(erfc) +DEFINE_BF16_FUNC_VIA_FLOAT(exp) +DEFINE_BF16_FUNC_VIA_FLOAT(exp2) +DEFINE_BF16_FUNC_VIA_FLOAT(expm1) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(fmod) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(hypot) +DEFINE_BF16_FUNC_VIA_FLOAT(i0) +DEFINE_BF16_FUNC_VIA_FLOAT(i0e) +DEFINE_BF16_FUNC_VIA_FLOAT(digamma) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(igamma) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(igammac) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(nextafter) +DEFINE_BF16_FUNC_VIA_FLOAT(log) +DEFINE_BF16_FUNC_VIA_FLOAT(log2) +DEFINE_BF16_FUNC_VIA_FLOAT(log10) +DEFINE_BF16_FUNC_VIA_FLOAT(log1p) +DEFINE_BF16_FUNC_VIA_FLOAT(sin) +DEFINE_BF16_FUNC_VIA_FLOAT(sinh) +DEFINE_BF16_FUNC_VIA_FLOAT(cos) +DEFINE_BF16_FUNC_VIA_FLOAT(cosh) +DEFINE_BF16_FUNC_VIA_FLOAT(ceil) +DEFINE_BF16_FUNC_VIA_FLOAT(floor) +DEFINE_BF16_FUNC_VIA_FLOAT(round) +DEFINE_BF16_FUNC_VIA_FLOAT(tan) +DEFINE_BF16_FUNC_VIA_FLOAT(tanh) +DEFINE_BF16_FUNC_VIA_FLOAT(trunc) +DEFINE_BF16_FUNC_VIA_FLOAT(lgamma) +DEFINE_BF16_FUNC_VIA_FLOAT(sqrt) +DEFINE_BF16_FUNC_VIA_FLOAT(reciprocal) +DEFINE_BF16_FUNC_VIA_FLOAT(rsqrt) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(pow) Vectorized inline Vectorized::operator==( const Vectorized& other) const { diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_convert.h b/aten/src/ATen/cpu/vec/vec128/vec128_convert.h index e968389987fc5..060d60fa3e2d8 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128_convert.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128_convert.h @@ -223,6 +223,62 @@ CONVERT_FROM_BF16_TEMPLATE(double) CONVERT_FROM_BF16_TEMPLATE(float16_t) #endif +#ifdef __ARM_FEATURE_BF16 + +// clang-[17, 20] crashes when autovectorizing static cast to bf16 +// Below is a workaround to have some vectorization +// Works decently well for smaller int types +template +inline void convertToBf16Impl( + const from_type* __restrict src, + c10::BFloat16* __restrict dst, + uint64_t n) { + bfloat16_t* dstPtr = reinterpret_cast(dst); + uint64_t loopBound = n - (n % 16); + uint64_t i = 0; + for (; i < loopBound; i += 16) { + float32x4_t a, b, c, d; + a[0] = static_cast(src[i]); + a[1] = static_cast(src[i + 1]); + a[2] = static_cast(src[i + 2]); + a[3] = static_cast(src[i + 3]); + b[0] = static_cast(src[i + 4]); + b[1] = static_cast(src[i + 5]); + b[2] = static_cast(src[i + 6]); + b[3] = static_cast(src[i + 7]); + c[0] = static_cast(src[i + 8]); + c[1] = static_cast(src[i + 9]); + c[2] = static_cast(src[i + 10]); + c[3] = static_cast(src[i + 11]); + d[0] = static_cast(src[i + 12]); + d[1] = static_cast(src[i + 13]); + d[2] = static_cast(src[i + 14]); + d[3] = static_cast(src[i + 15]); + + vst1q_bf16(dstPtr + i, vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(a), b)); + vst1q_bf16(dstPtr + i + 8, vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(c), d)); + } + +#pragma clang loop vectorize(disable) interleave(disable) unroll(disable) + for (; i < n; i++) { + float a = static_cast(src[i]); + dstPtr[i] = vcvth_bf16_f32(a); + } +} + +#define CONVERT_TO_BF16_TEMPLATE(from_type) \ + template <> \ + inline void convert(const from_type* src, c10::BFloat16* dst, int64_t n) { \ + return convertToBf16Impl(src, dst, n); \ + } + +CONVERT_TO_BF16_TEMPLATE(uint8_t) +CONVERT_TO_BF16_TEMPLATE(int8_t) +CONVERT_TO_BF16_TEMPLATE(int16_t) +CONVERT_TO_BF16_TEMPLATE(int32_t) + +#endif + inline void convertBoolToBfloat16Impl( const bool* __restrict src, c10::BFloat16* __restrict dst, diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h b/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h index c479fc2e4aeb2..6a64226475cf3 100644 --- 
a/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h @@ -11,6 +11,8 @@ #include #endif +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default") + // Sleef offers vectorized versions of some transcedentals // such as sin, cos, tan etc.. // However for now opting for STL, since we are not building @@ -650,3 +652,5 @@ inline Vectorized Vectorized::erf() const { } // namespace CPU_CAPABILITY } // namespace at::vec + +C10_DIAGNOSTIC_POP() diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index aaed431064611..9a55b058001da 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -388,6 +388,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D #ifndef USE_ROCM at::Half halpha; at::Half hbeta; + uint32_t mask = -1; #endif void * alpha_ptr = α void * beta_ptr = β @@ -427,7 +428,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS(); if (fp16_reduction != at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { - uint32_t mask = + mask = fp16_reduction == at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK ? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | @@ -444,7 +445,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS(); if (bf16_reduction != at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { - uint32_t mask = + mask = bf16_reduction == at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK ? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | @@ -511,17 +512,41 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D cublasStatus_t cublasStatus = CUBLAS_STATUS_SUCCESS; cublasLtMatmulHeuristicResult_t heuristicResult = {}; int returnedResult = 0; - TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic( - ltHandle, - computeDesc.descriptor(), - Adesc.descriptor(), - Bdesc.descriptor(), - Cdesc.descriptor(), - Cdesc.descriptor(), - preference.descriptor(), - 1, - &heuristicResult, - &returnedResult)); + // on Blackwell+, we fake a n > 1 matmul when querying heuristics + // to prevent cuBLASLt from dispatching to a GEMV kernel for batch-invariance +#ifndef USE_ROCM + const bool lie_to_cublaslt = mask == CUBLASLT_REDUCTION_SCHEME_NONE && n == 1 && at::cuda::getCurrentDeviceProperties()->major >= 10; +#else + const bool lie_to_cublaslt = false; +#endif + if (lie_to_cublaslt) { + CuBlasLtMatrixLayout FakeBdesc(abType, k, 2, ldb, opb == CUBLAS_OP_T); + CuBlasLtMatrixLayout FakeCdesc(cType, m, 2, ldc); + + TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic( + ltHandle, + computeDesc.descriptor(), + Adesc.descriptor(), + FakeBdesc.descriptor(), + FakeCdesc.descriptor(), + FakeCdesc.descriptor(), + preference.descriptor(), + 1, + &heuristicResult, + &returnedResult)); + } else { + TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic( + ltHandle, + computeDesc.descriptor(), + Adesc.descriptor(), + Bdesc.descriptor(), + Cdesc.descriptor(), + Cdesc.descriptor(), + preference.descriptor(), + 1, + &heuristicResult, + &returnedResult)); + } if (returnedResult == 0) { cublasStatus = CUBLAS_STATUS_NOT_SUPPORTED; } @@ -1572,7 +1597,7 @@ bool gemm_and_bias( } using opmath_t = at::opmath_type; - opmath_t beta_val = 0; // bias is added in epilogue + opmath_t beta_val = bias ? 
0 : 1; // bias is added in epilogue unless nullptr cudaDataType_t abType = CUDA_R_32F; cudaDataType_t cType = CUDA_R_32F; @@ -1661,15 +1686,22 @@ bool gemm_and_bias( _syncCurrentWithCarveoutStream(stream, true); } #endif - cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BIAS; - if (activation == GEMMAndBiasActivationEpilogue::RELU) { - epilogue = CUBLASLT_EPILOGUE_RELU_BIAS; - } else if (activation == GEMMAndBiasActivationEpilogue::GELU) { - epilogue = CUBLASLT_EPILOGUE_GELU_BIAS; - } + const auto epilogue = [&]() -> cublasLtEpilogue_t { + // The cuBLAS documentation indicates that + // *__BIAS = *_, + // but we keep it verbose here for clarity. + switch (activation) { + case GEMMAndBiasActivationEpilogue::RELU: + return bias ? CUBLASLT_EPILOGUE_RELU_BIAS : CUBLASLT_EPILOGUE_RELU; + case GEMMAndBiasActivationEpilogue::GELU: + return bias ? CUBLASLT_EPILOGUE_GELU_BIAS : CUBLASLT_EPILOGUE_GELU; + default: + return bias ? CUBLASLT_EPILOGUE_BIAS : CUBLASLT_EPILOGUE_DEFAULT; + } + }(); + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, epilogue); - if (bias != nullptr) { - computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, epilogue); + if (bias) { computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_POINTER, bias); } diff --git a/aten/src/ATen/cuda/CUDAContextLight.h b/aten/src/ATen/cuda/CUDAContextLight.h index 86e960cc1ab4a..01d10f61da692 100644 --- a/aten/src/ATen/cuda/CUDAContextLight.h +++ b/aten/src/ATen/cuda/CUDAContextLight.h @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -88,8 +89,13 @@ TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle(); TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle(); TORCH_CUDA_CPP_API void clearCublasWorkspaces(); -TORCH_CUDA_CPP_API std::map, at::DataPtr>& cublas_handle_stream_to_workspace(); -TORCH_CUDA_CPP_API std::map, at::DataPtr>& cublaslt_handle_stream_to_workspace(); +struct WorkspaceMapWithMutex { + std::map, at::DataPtr> map; + std::shared_mutex mutex; +}; + +TORCH_CUDA_CPP_API WorkspaceMapWithMutex& cublas_handle_stream_to_workspace(); +TORCH_CUDA_CPP_API WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace(); TORCH_CUDA_CPP_API size_t getChosenWorkspaceSize(); TORCH_CUDA_CPP_API size_t getCUDABlasLtWorkspaceSize(); TORCH_CUDA_CPP_API void* getCUDABlasLtWorkspace(); diff --git a/aten/src/ATen/cuda/CUDAGraph.cpp b/aten/src/ATen/cuda/CUDAGraph.cpp index 31d2d3f1fe589..de04bfdc691cc 100644 --- a/aten/src/ATen/cuda/CUDAGraph.cpp +++ b/aten/src/ATen/cuda/CUDAGraph.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -13,7 +14,7 @@ static bool _cuda_graphs_debug = false; MempoolId_t graph_pool_handle() { // Sets just the second value, to distinguish it from MempoolId_ts created from // cudaStreamGetCaptureInfo id_s in capture_begin. - return c10::cuda::MemPool::graph_pool_handle(); + return at::cuda::MemPool::graph_pool_handle(); } /** @@ -90,7 +91,7 @@ void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/, cudaStreamCaptureMode capt } else { // User did not ask us to share a mempool. Create graph pool handle using is_user_created=false. // Sets just the first value, to distinguish it from MempoolId_ts created by graph_pool_handle(). 
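The `graph_pool_handle()` path being moved to `at::cuda` here has a user-facing counterpart; a minimal sketch of two captures sharing one pool (shapes and ops are placeholders):

```python
# Sketch: share one memory pool between two graph captures via the public API.
import torch

pool = torch.cuda.graph_pool_handle()   # user-created handle ({0, uid} in the C++ above)
g1, g2 = torch.cuda.CUDAGraph(), torch.cuda.CUDAGraph()

a = torch.zeros(8, device="cuda")
with torch.cuda.graph(g1, pool=pool):
    b = a * 2
with torch.cuda.graph(g2, pool=pool):   # reuses g1's pool instead of creating a fresh one
    c = b + 1

g1.replay()
g2.replay()
```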
- mempool_id_ = c10::cuda::MemPool::graph_pool_handle(false); + mempool_id_ = at::cuda::MemPool::graph_pool_handle(false); TORCH_INTERNAL_ASSERT(mempool_id_.first > 0); } diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp index 6175e69827e2f..65c042723f90b 100644 --- a/aten/src/ATen/cuda/CublasHandlePool.cpp +++ b/aten/src/ATen/cuda/CublasHandlePool.cpp @@ -99,7 +99,7 @@ void destroyCublasHandle(cublasHandle_t handle) { // - Comments of @soumith copied from cuDNN handle pool implementation #ifdef NO_CUDNN_DESTROY_HANDLE #else - cublasDestroy(handle); + cublasDestroy(handle); #endif } @@ -107,19 +107,27 @@ using CuBlasPoolType = DeviceThreadHandlePool, at::DataPtr>& cublas_handle_stream_to_workspace() { - static auto& instance = *new std::map, at::DataPtr>; +WorkspaceMapWithMutex& cublas_handle_stream_to_workspace() { + static auto& instance = *new WorkspaceMapWithMutex; return instance; } -std::map, at::DataPtr>& cublaslt_handle_stream_to_workspace() { - static auto& instance = *new std::map, at::DataPtr>; +WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace() { + static auto& instance = *new WorkspaceMapWithMutex; return instance; } void clearCublasWorkspaces() { - cublas_handle_stream_to_workspace().clear(); - cublaslt_handle_stream_to_workspace().clear(); + { + auto& workspace = cublas_handle_stream_to_workspace(); + std::unique_lock lock(workspace.mutex); + workspace.map.clear(); + } + { + auto& workspace = cublaslt_handle_stream_to_workspace(); + std::unique_lock lock(workspace.mutex); + workspace.map.clear(); + } } size_t parseChosenWorkspaceSize() { @@ -241,8 +249,10 @@ void* getCUDABlasLtWorkspace() { auto stream = c10::cuda::getCurrentCUDAStream(); cudaStream_t _stream = stream; auto key = std::make_tuple(static_cast(handle), static_cast(_stream)); - auto workspace_it = at::cuda::cublas_handle_stream_to_workspace().find(key); - TORCH_INTERNAL_ASSERT(workspace_it != at::cuda::cublas_handle_stream_to_workspace().end()); + auto& workspace = at::cuda::cublas_handle_stream_to_workspace(); + std::shared_lock lock(workspace.mutex); + auto workspace_it = workspace.map.find(key); + TORCH_INTERNAL_ASSERT(workspace_it != workspace.map.end()); return workspace_it->second.mutable_get(); } #endif @@ -250,11 +260,34 @@ void* getCUDABlasLtWorkspace() { auto stream = c10::cuda::getCurrentCUDAStream(); cudaStream_t _stream = stream; auto key = std::make_tuple(static_cast(handle), static_cast(_stream)); - auto workspace_it = cublaslt_handle_stream_to_workspace().find(key); - if (workspace_it == cublaslt_handle_stream_to_workspace().end()) { - workspace_it = cublaslt_handle_stream_to_workspace().insert(workspace_it, {key, getNewCUDABlasLtWorkspace()}); + + auto& workspace = cublaslt_handle_stream_to_workspace(); + + // Fast path: check if workspace already exists + { + std::shared_lock lock(workspace.mutex); + auto workspace_it = workspace.map.find(key); + if (workspace_it != workspace.map.end()) { + return workspace_it->second.mutable_get(); + } + } + + // Slow path: allocate workspace outside the lock + auto new_workspace = getNewCUDABlasLtWorkspace(); + + // Insert with lock (double-check in case another thread inserted while we + // were allocating) + { + std::unique_lock lock(workspace.mutex); + auto workspace_it = workspace.map.find(key); + if (workspace_it == workspace.map.end()) { + workspace_it = + workspace.map.emplace(key, std::move(new_workspace)).first; + } + // else: another thread inserted it, our new_workspace will be automatically + 
// freed + return workspace_it->second.mutable_get(); } - return workspace_it->second.mutable_get(); } cublasHandle_t getCurrentCUDABlasHandle() { @@ -300,11 +333,39 @@ cublasHandle_t getCurrentCUDABlasHandle() { // all the memory and cublas's cudaMallocAsync will return OOM cudaStream_t _stream = stream; auto key = std::make_tuple(static_cast(handle), static_cast(_stream)); - auto workspace_it = cublas_handle_stream_to_workspace().find(key); - if (workspace_it == cublas_handle_stream_to_workspace().end()) { - workspace_it = cublas_handle_stream_to_workspace().insert(workspace_it, {key, getNewWorkspace()}); + + auto& workspace = cublas_handle_stream_to_workspace(); + + size_t workspace_size = getChosenWorkspaceSize(); + + // Fast path: check if workspace already exists + { + std::shared_lock lock(workspace.mutex); + auto workspace_it = workspace.map.find(key); + if (workspace_it != workspace.map.end()) { + TORCH_CUDABLAS_CHECK(cublasSetWorkspace( + handle, workspace_it->second.get(), workspace_size)); + return handle; + } + } + + // Slow path: allocate workspace outside the lock + auto new_workspace = getNewWorkspace(); + + // Insert with lock (double-check in case another thread inserted while we + // were allocating) + { + std::unique_lock lock(workspace.mutex); + auto workspace_it = workspace.map.find(key); + if (workspace_it == workspace.map.end()) { + workspace_it = + workspace.map.emplace(key, std::move(new_workspace)).first; + } + // else: another thread inserted it, our new_workspace will be automatically + // freed + TORCH_CUDABLAS_CHECK( + cublasSetWorkspace(handle, workspace_it->second.get(), workspace_size)); } - TORCH_CUDABLAS_CHECK(cublasSetWorkspace(handle, workspace_it->second.get(), getChosenWorkspaceSize())); #if !defined(USE_ROCM) // On CUDA >= 11, and architecture >= Ampere, cuBLAS can use TF32 to speedup // FP32 data type calculations based on the value of the allow_tf32 flag. diff --git a/aten/src/ATen/cuda/MemPool.cpp b/aten/src/ATen/cuda/MemPool.cpp new file mode 100644 index 0000000000000..99405965898e0 --- /dev/null +++ b/aten/src/ATen/cuda/MemPool.cpp @@ -0,0 +1,69 @@ +#include +#include + +namespace at::cuda { + +// uid_ is incremented when a user creates a MemPool, +// for example: using graph_pool_handle() or c10::cuda::MemPool(). +// +// uuid_ is incremented when CUDAGraph creates a MemPool +// as a result of a user not providing a pool. +// +// MempoolId_t of {0, 0} is used to denote when no MemPool has been +// passed to a function, either by user or CUDAGraphs. For example, +// default value of MempoolId_t for capture_begin function is {0, 0}. +// That's why uid_ and uuid_ start at 1. +std::atomic MemPool::uid_{1}; +std::atomic MemPool::uuid_{1}; + +MemPool::MemPool( + CUDACachingAllocator::CUDAAllocator* allocator, + bool is_user_created, + bool use_on_oom) + : allocator_(allocator), is_user_created_(is_user_created) { + if (is_user_created_) { + id_ = {0, uid_++}; + } else { + id_ = {uuid_++, 0}; + } + device_ = c10::cuda::current_device(); + CUDACachingAllocator::createOrIncrefPool(device_, id_, allocator); + if (use_on_oom) { + CUDACachingAllocator::setUseOnOOM(device_, id_); + } +} + +MemPool::~MemPool() { + // TORCH_INTERNAL_ASSERT(use_count() == 1); + // We used to assert that TORCH_INTERNAL_ASSERT(use_count() == 1); + // However, this assertion is not true if a memory pool is shared + // with a cuda graph. That CUDAGraph will increase the use count + // until it is reset. 
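A hedged sketch of how this class is typically driven from Python, assuming the `torch.cuda.MemPool` / `torch.cuda.use_mem_pool` bindings that mirror it (treat those names as assumptions if your build differs):

```python
# Hedged sketch: user-created pool, allocation routed through it, then release.
import torch

pool = torch.cuda.MemPool()                # user-created: id is {0, uid_++}
with torch.cuda.use_mem_pool(pool):        # assumed binding mirroring the C++ above
    t = torch.randn(1024, device="cuda")   # allocated from `pool`

del t
del pool  # ~MemPool: releasePool(...) then emptyCache(id_), per the destructor above
```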
+ CUDACachingAllocator::releasePool(device_, id_); + c10::cuda::CUDACachingAllocator::emptyCache(id_); +} + +MempoolId_t MemPool::id() { + return id_; +} + +CUDACachingAllocator::CUDAAllocator* MemPool::allocator() { + return allocator_; +} + +int MemPool::use_count() { + return CUDACachingAllocator::getPoolUseCount(device_, id_); +} + +c10::DeviceIndex MemPool::device() { + return device_; +} + +MempoolId_t MemPool::graph_pool_handle(bool is_user_created) { + if (is_user_created) { + return {0, uid_++}; + } + return {uuid_++, 0}; +} + +} // namespace at::cuda diff --git a/aten/src/ATen/cuda/MemPool.h b/aten/src/ATen/cuda/MemPool.h new file mode 100644 index 0000000000000..ba281c96b7043 --- /dev/null +++ b/aten/src/ATen/cuda/MemPool.h @@ -0,0 +1,44 @@ +#pragma once + +#include +#include + +namespace at::cuda { + +// Keep BC only +using c10::CaptureId_t; +using c10::MempoolId_t; + +// MemPool represents a pool of memory in a caching allocator. Currently, +// it's just the ID of the pool object maintained in the CUDACachingAllocator. +// +// An allocator pointer can be passed to the MemPool to define how the +// allocations should be done in the pool. For example: using a different +// system allocator such as ncclMemAlloc. +struct TORCH_CUDA_CPP_API MemPool { + MemPool( + c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator = nullptr, + bool is_user_created = true, + bool use_on_oom = false); + MemPool(const MemPool&) = delete; + MemPool(MemPool&&) = default; + MemPool& operator=(const MemPool&) = delete; + MemPool& operator=(MemPool&&) = default; + ~MemPool(); + + MempoolId_t id(); + c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator(); + int use_count(); + c10::DeviceIndex device(); + static MempoolId_t graph_pool_handle(bool is_user_created = true); + + private: + static std::atomic uid_; + static std::atomic uuid_; + c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator_; + bool is_user_created_; + MempoolId_t id_; + c10::DeviceIndex device_; +}; + +} // namespace at::cuda diff --git a/aten/src/ATen/cuda/cub.h b/aten/src/ATen/cuda/cub.h index 7430edaf8a3dc..bca9b1faff523 100644 --- a/aten/src/ATen/cuda/cub.h +++ b/aten/src/ATen/cuda/cub.h @@ -24,7 +24,13 @@ namespace detail { // radix_sort_pairs doesn't interact with value_t other than to copy // the data, so we can save template instantiations by reinterpreting // it as an opaque type. +// We use native integer types for 1/2/4/8-byte values to reduce +// register usage in CUDA kernels. For sizes > 8 fall back to char array. 
template struct alignas(N) OpaqueType { char data[N]; }; +template <> struct alignas(1) OpaqueType<1> { uint8_t data; }; +template <> struct alignas(2) OpaqueType<2> { uint16_t data; }; +template <> struct alignas(4) OpaqueType<4> { uint32_t data; }; +template <> struct alignas(8) OpaqueType<8> { uint64_t data; }; template void radix_sort_pairs_impl( diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index b7f80101d926e..594045a1b41d2 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -21,6 +21,7 @@ #if AT_CUDNN_ENABLED() #include +#include #endif #if AT_MAGMA_ENABLED() @@ -351,6 +352,26 @@ long CUDAHooks::versionCuDNN() const { #endif } +long CUDAHooks::versionRuntimeCuDNN() const { +#if AT_CUDNN_ENABLED() +#ifndef USE_STATIC_CUDNN + return cudnnGetVersion(); +#else + return CUDNN_VERSION; +#endif +#else + TORCH_CHECK(false, "Cannot query CuDNN version if ATen_cuda is not built with CuDNN"); +#endif +} + +long CUDAHooks::versionCuDNNFrontend() const { +#if AT_CUDNN_ENABLED() + return CUDNN_FRONTEND_VERSION; +#else + TORCH_CHECK(false, "Cannot query CuDNN Frontend version if ATen_cuda is not built with CuDNN"); +#endif +} + long CUDAHooks::versionMIOpen() const { #if AT_ROCM_ENABLED() return MIOPEN_VERSION_MAJOR * 10000 + diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index 8d3d1db003928..8902c68d342f8 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -49,6 +49,8 @@ struct CUDAHooks : public at::CUDAHooksInterface { bool hasCUDART() const override; long versionCUDART() const override; long versionCuDNN() const override; + long versionRuntimeCuDNN() const override; + long versionCuDNNFrontend() const override; long versionMIOpen() const override; std::string showConfig() const override; double batchnormMinEpsilonCuDNN() const override; diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index f1f2056917472..0ab8e82a30166 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -174,6 +174,14 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface { TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP); } + virtual long versionRuntimeCuDNN() const { + TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP); + } + + virtual long versionCuDNNFrontend() const { + TORCH_CHECK(false, "Cannot query cuDNN Frontend version without ATen_cuda library. ", CUDA_HELP); + } + virtual long versionMIOpen() const { TORCH_CHECK(false, "Cannot query MIOpen version without ATen_cuda library. 
", CUDA_HELP); } diff --git a/aten/src/ATen/functorch/BatchedTensorImpl.h b/aten/src/ATen/functorch/BatchedTensorImpl.h index 985b289b3fe02..14be24d63e65a 100644 --- a/aten/src/ATen/functorch/BatchedTensorImpl.h +++ b/aten/src/ATen/functorch/BatchedTensorImpl.h @@ -157,6 +157,8 @@ constexpr DispatchKeySet kKeysToPropagateToWrapper({ DispatchKey::Negative, DispatchKey::Conjugate, DispatchKey::XLA, + DispatchKey::XPU, + DispatchKey::HPU, DispatchKey::CUDA, DispatchKey::CPU, DispatchKey::PrivateUse1, diff --git a/aten/src/ATen/mps/MPSAllocator.mm b/aten/src/ATen/mps/MPSAllocator.mm index c8b3453fc81dd..dfdd67c8f4458 100644 --- a/aten/src/ATen/mps/MPSAllocator.mm +++ b/aten/src/ATen/mps/MPSAllocator.mm @@ -440,7 +440,7 @@ // we need to release the lock temporarily as synchronizing may cause deadlock with completion handlers. m_mutex.unlock(); auto stream = getDefaultMPSStream(); - dispatch_sync(stream->queue(), ^() { + dispatch_sync_with_rethrow(stream->queue(), ^() { stream->synchronize(SyncType::COMMIT_AND_WAIT); }); m_mutex.lock(); diff --git a/aten/src/ATen/mps/MPSStream.h b/aten/src/ATen/mps/MPSStream.h index 10627cfc36b80..b00890b9f5901 100644 --- a/aten/src/ATen/mps/MPSStream.h +++ b/aten/src/ATen/mps/MPSStream.h @@ -110,6 +110,9 @@ class TORCH_API MPSStream { return _stream; } + MTLBuffer_t getErrorBuffer(); + void checkLastError(); + private: Stream _stream; MTLCommandQueue_t _commandQueue = nil; @@ -121,6 +124,8 @@ class TORCH_API MPSStream { dispatch_queue_t _serialQueue = nullptr; // CommitAndContinue is enabled by default bool _enableCommitAndContinue = true; + // Buffer that contains last raised error + MTLBuffer_t _errorBuffer = nil; // use synchronize() to access any of these commit functions outside MPSStream void commit(); @@ -155,4 +160,7 @@ class TORCH_API MPSStreamImpl { MPSStreamImpl(); }; +#ifdef __OBJC__ +void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)()); +#endif } // namespace at::mps diff --git a/aten/src/ATen/mps/MPSStream.mm b/aten/src/ATen/mps/MPSStream.mm index 595d71aeef15a..2150c21c18d75 100644 --- a/aten/src/ATen/mps/MPSStream.mm +++ b/aten/src/ATen/mps/MPSStream.mm @@ -3,13 +3,13 @@ #include #include #include +#include @interface MPSGraphExecutionDescriptor () @property(readwrite, atomic) BOOL enableCommitAndContinue; @end namespace at::mps { - //----------------------------------------------------------------- // MPSStream //----------------------------------------------------------------- @@ -30,6 +30,10 @@ @interface MPSGraphExecutionDescriptor () // Choose level which optimizes for GPU _compilationDescriptor.optimizationLevel = MPSGraphOptimizationLevel0; _executionDescriptor.compilationDescriptor = _compilationDescriptor; + + _errorBuffer = [MPSDevice::getInstance()->device() newBufferWithLength:sizeof(c10::metal::ErrorMessages) + options:MTLResourceStorageModeShared]; + std::memset([_errorBuffer contents], 0, 1024); } MPSStream::~MPSStream() { @@ -38,6 +42,8 @@ @interface MPSGraphExecutionDescriptor () [_executionDescriptor release]; [_compilationDescriptor release]; _executionDescriptor = nil; + [_errorBuffer release]; + _errorBuffer = nil; _compilationDescriptor = nil; assert(_commandBuffer == nil); @@ -104,6 +110,7 @@ @interface MPSGraphExecutionDescriptor () [_prevCommandBuffer waitUntilCompleted]; [_prevCommandBuffer release]; _prevCommandBuffer = nil; + checkLastError(); } if (_commandBuffer) { @@ -111,6 +118,7 @@ @interface MPSGraphExecutionDescriptor () [_commandBuffer waitUntilCompleted]; [_commandBuffer release]; 
_commandBuffer = nil; + checkLastError(); } } @@ -153,7 +161,7 @@ @interface MPSGraphExecutionDescriptor () if (length == 0) { return; } - dispatch_sync(_serialQueue, ^() { + dispatch_sync_with_rethrow(_serialQueue, ^() { @autoreleasepool { endKernelCoalescing(); id blitEncoder = [commandBuffer() blitCommandEncoder]; @@ -183,7 +191,7 @@ @interface MPSGraphExecutionDescriptor () size_t dstOffset, uint64_t profileId, SyncType syncType) { - dispatch_sync(_serialQueue, ^() { + dispatch_sync_with_rethrow(_serialQueue, ^() { @autoreleasepool { endKernelCoalescing(); id blitEncoder = [commandBuffer() blitCommandEncoder]; @@ -236,7 +244,7 @@ @interface MPSGraphExecutionDescriptor () auto& profiler = getMPSProfiler(); const bool isGraphProfilingEnabled = profiler.isOperationProfilingEnabled(); - dispatch_sync(_serialQueue, ^() { + dispatch_sync_with_rethrow(_serialQueue, ^() { endKernelCoalescing(); if (isGraphProfilingEnabled) { // this function call is only relevant for interval-based Signposts @@ -266,6 +274,24 @@ @interface MPSGraphExecutionDescriptor () }); } +id MPSStream::getErrorBuffer() { + return _errorBuffer; +} + +void MPSStream::checkLastError() { + auto msgs = reinterpret_cast([_errorBuffer contents]); + const auto& msg = msgs->msg[0]; + if (!msgs) { + return; + } + unsigned int count = 0; + std::swap(count, msgs->count); + if (!count) { + return; + } + throw c10::AcceleratorError({msg.func, msg.file, msg.line}, 1, msg.message); +} + //----------------------------------------------------------------- // MPSStreamImpl //----------------------------------------------------------------- @@ -289,4 +315,19 @@ @interface MPSGraphExecutionDescriptor () return MPSStreamImpl::getInstance(); } +// Helper methods +void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)()) { + __block std::optional block_exception; + dispatch_sync(queue, ^() { + try { + block(); + } catch (...) 
{ + block_exception = std::current_exception(); + } + }); + if (block_exception) { + std::rethrow_exception(*block_exception); + } +} + } // namespace at::mps diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index f5d5edb6439a6..2fa6bcc6dc9ac 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -1009,12 +1009,25 @@ static Device correct_out_device(const Tensor& self, const Tensor& other) { } } +static Tensor send_to_meta(const Tensor& self, const Device& device) { + Tensor out_meta; + if (self._is_zerotensor() && self.unsafeGetTensorImpl()->is_wrapped_number()) { + out_meta = at::_efficientzerotensor(self.sizes(), self.options().device(device)); + out_meta.unsafeGetTensorImpl()->set_wrapped_number(true); + } else { + out_meta = self.to(device); + } + return out_meta; +} + Tensor mul_zerotensor(const Tensor& self, const Tensor& other) { auto out_device = correct_out_device(self, other); // hack to use the TensorIterator to get the correct broadcasting and type promotion logic auto device_ = Device(DeviceType::Meta); constexpr c10::DispatchKeySet meta_dks(at::DispatchKey::Meta); - auto meta_out = at::_ops::mul_Tensor::redispatch(meta_dks, self.to(device_), other.to(device_)); + auto self_meta = send_to_meta(self, device_); + auto other_meta = send_to_meta(other, device_); + auto meta_out = at::_ops::mul_Tensor::redispatch(meta_dks, self_meta, other_meta); return at::_efficientzerotensor(meta_out.sizes(), meta_out.options().device(out_device)); } @@ -1023,7 +1036,9 @@ Tensor div_zerotensor(const Tensor& self, const Tensor& other) { // hack to use the TensorIterator to get the correct broadcasting and type promotion logic auto device_ = Device(DeviceType::Meta); constexpr c10::DispatchKeySet meta_dks(at::DispatchKey::Meta); - auto meta_out = at::_ops::div_Tensor::redispatch(meta_dks, self.to(device_), other.to(device_)); + auto self_meta = send_to_meta(self, device_); + auto other_meta = send_to_meta(other, device_); + auto meta_out = at::_ops::div_Tensor::redispatch(meta_dks, self_meta, other_meta); if (self._is_zerotensor()) { if (other._is_zerotensor()) { @@ -1052,8 +1067,9 @@ static Tensor maybe_add_maybe_sub(const Tensor& self, const Tensor& other, const // hack to use the TensorIterator to get the correct broadcasting and type promotion logic auto device_ = Device(DeviceType::Meta); constexpr c10::DispatchKeySet meta_dks(at::DispatchKey::Meta); - auto meta_out = at::_ops::add_Tensor::redispatch( - meta_dks, self.to(device_), other.to(device_), alpha); + auto self_meta = send_to_meta(self, device_); + auto other_meta = send_to_meta(other, device_); + auto meta_out = at::_ops::add_Tensor::redispatch(meta_dks, self_meta, other_meta, alpha); auto get_out_like = [&] (const Tensor& tensor) { diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index 2c3f14aab911c..ca3a4f5f3faba 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -409,7 +409,7 @@ struct ConvParams { if (!detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda() || !cudnn_enabled) { return false; } - static long cudnn_version = detail::getCUDAHooks().versionCuDNN(); + static long cudnn_version = detail::getCUDAHooks().versionRuntimeCuDNN(); // broken on cuDNN 9.8 - 9.14 if (cudnn_version >= 90800 && cudnn_version < 91500) { if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous && @@ -453,7 +453,7 @@ struct ConvParams { } // native kernel 
doesn't support 64-bit non-splittable case if (!(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) { - static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionCuDNN() : -1; + static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionRuntimeCuDNN() : -1; // TODO(eqy): remove this once cuDNN fixes 64-bit depthwise support, first broken in 9.11x if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) { if (cudnn_version < 0 || cudnn_version > 91000) { diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index 1da245972f0cb..fbabba84dbb2d 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -50,18 +50,35 @@ static inline bool parseLinearFlatten3d() { // `_flatten_nd_linear` flattens all but the last dimension of the input tensor // before passing it to linear operation static inline Tensor _flatten_nd_linear(const Tensor& input, const Tensor& weight, const Tensor& bias) { - const auto input_sizes = input.sym_sizes(); - // can't use -1 in reshape because it errors when a dimension is 0 - c10::SymInt flattened_dim = 1; - for (int64_t i = 0, ndim = input_sizes.size(); i < ndim - 1; ++i) { - flattened_dim = flattened_dim * input_sizes[i]; + const auto input_sizes = input.sym_sizes(); + + const auto result_flattened = [&]() -> Tensor { + const auto input_ncols = input_sizes.back(); + const auto input_flattened_nrows = [&]() -> c10::SymInt { + // can't use -1 in reshape because it errors when a dimension is 0 + auto flattened_nrows = c10::SymInt{1}; + for (const auto& size : input_sizes.slice(0, input_sizes.size() - 1)) { + flattened_nrows *= size; + } + return flattened_nrows; + }(); + + const auto input_flattened = input.view_symint({input_flattened_nrows, input_ncols}); + if (weight.layout() == c10::kStrided) { + return at::addmm(bias, input_flattened, weight.t()); + } else { + // weight is sparse, and addmm for sparse expects matmul lhs to be sparse, + // so we transpose the problem. + // NOTE: at::matmul handles (dense @ sparse) similarly. + const auto bias_t = (bias.dim() >= 2) ? bias.mT() : bias.unsqueeze(-1); + return at::addmm(bias_t, weight, input_flattened.t()).t(); } - auto inp_reshape = input.reshape_symint({flattened_dim, input_sizes.at(input_sizes.size() -1)}); - const auto result = at::addmm(bias, inp_reshape, weight.t()); - auto new_size = input_sizes.slice(0, input_sizes.size() - 1); - c10::SymDimVector sizes_vec(new_size.begin(), new_size.end()); - sizes_vec.push_back(result.sym_size(1)); - return result.view_symint(sizes_vec); + }(); + + // Unflatten flattened row dims + auto result_sizes = c10::SymDimVector{input_sizes.begin(), input_sizes.end()}; + result_sizes.back() = result_flattened.sym_size(1); + return result_flattened.view_symint(result_sizes); } @@ -90,15 +107,23 @@ Tensor linear(const Tensor& input, const Tensor& weight, const std::optionaldefined() && !input.is_xla()) { - // Also hit the fused path for contiguous 3D input, if not using xla + + const auto is_bias_likely_fusable = ( + bias->defined() && + // cuBLASLt: will fuse in the epilogue without copies + // when input/weight/bias are all strided. + // When weight is not strided, bias will not be fused, + // but we can still dispatch here to avoid at::matmul + // path which will probably use a very similar + // flattening optimization. 
+ ((bias->dim() == 1 || bias->squeeze().dim() == 1) && bias->is_contiguous_or_false()) + ); + if (is_bias_likely_fusable && !input.is_xla()) { + // Also hit the fused path for contiguous nD input, if not using xla // backend. Reshaping/flattening has some performance implications on xla. - bool is_contiguous = input.is_contiguous_or_false(); - if (is_contiguous && input_dim == 3) { - return _flatten_nd_linear(input, weight, *bias); - } else if (is_contiguous && input.layout() == c10::kStrided && weight.layout() == c10::kStrided && bias->dim() == 1) { + if (input.is_contiguous_or_false()) { return _flatten_nd_linear(input, weight, *bias); - } else if (parseLinearFlatten3d() && input_dim == 3) { + } else if (parseLinearFlatten3d()) { // If user forces flattening via env var const Tensor input_cont = input.contiguous(); return _flatten_nd_linear(input_cont, weight, *bias); diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 07bdc19ec8ff7..846adb2be3c6e 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -1936,7 +1936,7 @@ static bool should_fold(const Tensor& tensor1, const Tensor& tensor2, bool has_o // We order the tensors. t1 will be the larger tensor // We can always transpose tensor2 as the dimensions are always >= 1 (precondition from matmul) - // and tensor1_larger iff tensor2.dim() > tensor1.dim(9 + // and tensor1_larger iff tensor2.dim() > tensor1.dim() const auto t1 = tensor1_larger ? MaybeOwned::borrowed(tensor1) : MaybeOwned::owned(tensor2.mT()); const int64_t dim_t1 = t1->dim(); @@ -1948,20 +1948,11 @@ static bool should_fold(const Tensor& tensor1, const Tensor& tensor2, bool has_o return false; } - // In this case we *do* incur in an extra copy to avoid creating an unnecessary large tensor in the backward - // Suppose we don't fold here. Let t1.shape = [b, m, n] t2.shape = [n, k] like in a transformer - // t2 will be expanded to a tensor of shape [b, n, k] and then we do t1.bmm(t2_expanded) - // The issue appears in the backward. - // The output gradient g of this operation would have shape [b, m, k] - // The backward wrt. t2 of bmm would be given by t1.mH @ g, which has shape [b, n, k] - // Then, the backward of expand is simply `sum(0)`. As such, we are instantiating a tensor - // of shape [b, n, k] unnecessarily, which may cause a large memory footprint, and in the - // worst case, an OOM - bool t2_requires_grad = tensor1_larger ? 
tensor2.requires_grad() : tensor1.requires_grad(); - if (t2_requires_grad && !has_out) { - // We should be checking !at::GradMode::is_enabled(), but apparently - // this regresses performance in some cases: - // https://github.com/pytorch/pytorch/issues/118548#issuecomment-1916022394 + // If we require a gradient, we should fold to minimize backward memory usage - even if this + // leads to a copy in forward because is needed in backward, + // only time we avoid this strict pre-allocated memory usage (has_out = True) + bool requires_grad = tensor1.requires_grad() || tensor2.requires_grad(); + if (requires_grad && !has_out) { return true; } diff --git a/aten/src/ATen/native/PackedSequence.cpp b/aten/src/ATen/native/PackedSequence.cpp index d069108348d24..be7961b2a2452 100644 --- a/aten/src/ATen/native/PackedSequence.cpp +++ b/aten/src/ATen/native/PackedSequence.cpp @@ -142,6 +142,7 @@ Tensor _pack_padded_sequence_backward_symint(const Tensor& grad, c10::SymIntArra std::tuple _pad_packed_sequence(const Tensor& data, const Tensor& _batch_sizes, bool batch_first, const Scalar& padding_value, int64_t total_length) { auto batch_sizes_t = _batch_sizes.contiguous(); checkLongTensor(batch_sizes_t); + TORCH_CHECK(batch_sizes_t.numel() > 0, "batch_sizes can not be empty"); int64_t * batch_sizes = batch_sizes_t.data_ptr(); int64_t max_batch_size = batch_sizes[0]; diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index c6126eda61e73..c086c7db31404 100644 --- a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -479,6 +480,14 @@ Tensor isfinite(const Tensor& self) { }); } +void _async_error(std::string_view msg) { + TORCH_CHECK(0, msg); +} + +void _async_error_meta(std::string_view msg) { + // Do NOT error, it's an async error! +} + void _assert_async_cpu(const Tensor& self) { TORCH_CHECK( native::is_nonzero(self), diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index 6df7761d822db..0079a530b3d0e 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -1,5 +1,6 @@ #include #include +#include #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include @@ -1710,11 +1711,37 @@ Tensor narrow_symint( "], but got ", start, ")") - if (start < 0) { - start = start + cur_size; - } + + auto cond1 = TORCH_GUARD_OR_FALSE(start.sym_lt(0)); + auto cond2 = TORCH_GUARD_OR_FALSE(start.sym_ge(0)); + + if (cond1 || cond2) { + if (cond1) { + start = start + cur_size; + } + + TORCH_SYM_CHECK( + start.sym_le(cur_size - length), + "start (", + start, + ") + length (", + length, + ") exceeds dimension size (", + cur_size, + ")."); + return at::slice_symint(self, dim, start, start + length, 1); + } + + // Unbacked start handling! 
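For reference, the folding that `should_fold` above opts into when a gradient is required can be written out in plain PyTorch: merging the batch dimension into the rows turns a batched matmul against a 2D operand into a single `mm`, which avoids expanding the 2D operand (and its backward buffer) to the full batch size.

```python
import torch

t1 = torch.randn(4, 3, 5)   # [b, m, n]
t2 = torch.randn(5, 2)      # [n, k]

# Fold [b, m, n] @ [n, k] into ([b*m, n] @ [n, k]).reshape(b, m, k).
folded = t1.reshape(-1, t1.size(-1)).mm(t2).reshape(4, 3, 2)
torch.testing.assert_close(folded, t1 @ t2)
```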
+ + // Bounds check without converting start: + // - If start < 0: need (start + cur_size) + length <= cur_size, i.e., start + + // length <= 0 + // - If start >= 0: need start + length <= cur_size + auto end = start + length; TORCH_SYM_CHECK( - start.sym_le(cur_size - length), + (start.sym_lt(0).sym_and((end).sym_le(0))) + .sym_or(start.sym_ge(0).sym_and((end).sym_le(cur_size))), "start (", start, ") + length (", @@ -1722,7 +1749,28 @@ Tensor narrow_symint( ") exceeds dimension size (", cur_size, ")."); - return at::slice_symint(self, dim, start, start + length, 1); + + if (TORCH_GUARD_OR_FALSE(end.sym_ne(0))) { + return at::slice_symint(self, dim, start, end, 1); + } else { + // Cannot statically determine the condition due to unbacked. + // This is an interesting situation; when start is negative and + // start + length == 0, slice and narrow do different things. + // i.e., x.narrow(0, -2, 2) != x[-2:0]; in that case, we want to + // pass curr_size instead of 0. Otherwise, they would do the same thing. + // This says at runtime: if start < 0 and end == 0, then pass curr_size + // instead of 0. + + auto use_different = start.sym_lt(0).sym_and(end.sym_eq(0)).toSymInt(); + auto result = + at::slice_symint(self, dim, start, end + use_different * cur_size, 1); + + // Ensure slice allocated unbacked size is specialized to length. + SymInt new_size = result.sym_size(dim); + TORCH_SYM_CHECK(new_size.sym_eq(length), "") + + return result; + } } // This overload exists purely for XLA, because they wanted to pass in @@ -1736,8 +1784,8 @@ Tensor narrow_tensor_symint( start.dim() == 0 && isIntegralType(start.scalar_type(), /*includeBool=*/false), "start must be an 0-dim integral Tensor."); - int64_t st = start.item(); - return at::narrow_symint(self, dim, c10::SymInt(st), std::move(length)); + c10::SymInt st = start.item().toSymInt(); + return at::narrow_symint(self, dim, std::move(st), std::move(length)); } std:: diff --git a/aten/src/ATen/native/TransposeType.h b/aten/src/ATen/native/TransposeType.h index 603bf6fee60aa..bb63e6d542482 100644 --- a/aten/src/ATen/native/TransposeType.h +++ b/aten/src/ATen/native/TransposeType.h @@ -1,6 +1,8 @@ #pragma once #include +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default") + namespace at::native { // Used as an interface between the different BLAS-like libraries @@ -21,3 +23,5 @@ static inline char to_blas(TransposeType trans) { } } // namespace at::native + +C10_DIAGNOSTIC_POP() diff --git a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp index 7587988528ebb..73f8c136794ce 100644 --- a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp +++ b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp @@ -293,7 +293,7 @@ struct ComputeLocationBase { , empty(size <= 0) {} inline Vec unnormalize(const Vec &in) const { - return (in + Vec(1)) * Vec(scaling_factor) - Vec(0.5); + return (in + Vec(static_cast(1))) * Vec(scaling_factor) - Vec(static_cast(0.5)); } inline Vec clip_coordinates(const Vec &in) const { @@ -831,7 +831,7 @@ struct ApplyGridSample(-0.75)); ApplyGridSample(const TensorAccessor& input) : inp_H(input.size(2)) diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index 186f7d8a6a78a..2754d70cac013 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -147,14 +147,24 @@ static bool isGloballyDisabledAddmmCudaLt(const at::Device& device) { /* * Check whether for the given input we want to enable the Lt interface */ -static bool 
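The comment above about `x.narrow(0, -2, 2) != x[-2:0]` is easy to check from Python, and it is exactly the case where the slice end must be rewritten to `cur_size`:

```python
import torch

x = torch.arange(5)
print(x.narrow(0, -2, 2))   # tensor([3, 4]) -- negative start wraps around
print(x[-2:0])              # tensor([])     -- a slice ending at 0 is empty
# So when start < 0 and start + length == 0, the kernel passes cur_size
# instead of 0 as the slice end to reproduce narrow() semantics.
```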
isInputCompliesAddmmCudaLt(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha) { +static bool isInputCompliesAddmmCudaLt( + Tensor& result, + const Tensor& self, + const Tensor& mat1, + const Tensor& mat2, + const Scalar& beta, + const Scalar& alpha, + Activation activation +) { + #ifdef USE_ROCM // Implies 2D bias which we currently not send through Lt. // TODO: this check is done pre col-major input preparation, // so, this condition can be ralexed in cases when a col-major // copy of result is needed. - if (result.is_same(self)) { + if (self.is_same(result) || self.dim() == 2) { return false; } + #endif #if defined(USE_ROCM) && ROCM_VERSION == 60400 // hipblaslt TT fp32 regression on ROCm 6.4, cannot use @@ -169,13 +179,33 @@ static bool isInputCompliesAddmmCudaLt(Tensor& result, const Tensor& self, const #if defined(CUDA_VERSION) || defined(USE_ROCM) const auto scalar_type = mat1.scalar_type(); return (beta.toComplexDouble() == 1.0 + // NOTE: row-major result is important when bias is 1D. + // This is because Lt broadcasts 1D bias over the columns + // while the aten::addmm API broadcasts it over the rows, + // and this is in conjuction with the data preparation + // procedure that does not transpose arguments with + // col-major result. For col-major result we need + // to explicitly transpose the problem so that bias is + // correctly applied. + // TODO: enable col-major result if needed. + // TODO: no need to check result's layout when + // !result.is_same(self) and self.dim() == 2, because + // self needs to be copied into result and the bias ptr + // will be ignored. && result.dim() == 2 && result.is_contiguous() - // Conditions for bias to be fusable && ( - self.is_contiguous() && - // NOTE: fine to have 1-len dims to the left from the right-most one - (self.dim() == 1 || self.squeeze().dim() == 1) && - self.sizes().back() == mat2_sizes[1] + ( // Conditions for bias to be fusable -- implies direct Lt path without copies. + self.is_contiguous() && + // NOTE: fine to have 1-len dims to the left from the right-most one + (self.dim() == 1 || self.squeeze().dim() == 1) && + self.sizes().back() == mat2_sizes[1] + ) + || ( // 2D bias restrictions. self.is_contiguous() is implicit when result.is_same(self), + // and we need to copy self into result otherwise, so the self's layout becomes irrelevant. + // See also TODO from above. + activation != Activation::None && // Lt is faster when activation is fused + (self.dim() == 2 && at::is_expandable_to(self.sizes(), {mat1_sizes[0], mat2_sizes[1]})) + ) ) && ( // some dtype restrictions #ifndef USE_ROCM @@ -270,7 +300,16 @@ bool launchGemmAndBiasCublasLt( const Scalar& alpha, Activation activation = Activation::None ) { - const auto* self_ptr = self.const_data_ptr(); + // We apply bias in the epilogue only when it is 1D, + // or when it can be squeezed to 1D. + // self_ptr == nullptr implies ignore bias epilogue + // and use standard gemm-like API. 
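The note above hinges on how `aten::addmm` broadcasts a 1D `self`: the bias vector of length N is added to every row of the [M, N] product, which the Lt epilogue only matches when the result is row-major. A small sanity check of that broadcast:

```python
import torch

bias = torch.tensor([10., 20., 30.])   # length N = 3
m1 = torch.ones(2, 4)
m2 = torch.ones(4, 3)

out = torch.addmm(bias, m1, m2)        # shape [2, 3]
print(out)                             # every row is [14., 24., 34.]
```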
+ const auto* self_ptr = [&]() -> auto { + if (self.dim() == 1 || self.squeeze().dim() == 1) { + return self.const_data_ptr(); + } + return static_cast(nullptr); + }(); const auto tuning_ctx = at::cuda::tunable::getTuningContext(); if (tuning_ctx->IsTunableOpEnabled()) { @@ -356,7 +395,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma disable_addmm_cuda_lt = isGloballyDisabledAddmmCudaLt(self.device()) || disable_addmm_cuda_lt; #endif // Condition on the input - disable_addmm_cuda_lt = !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha) || disable_addmm_cuda_lt; + disable_addmm_cuda_lt = !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha, activation) || disable_addmm_cuda_lt; // } at::ScalarType scalar_type = mat1.scalar_type(); @@ -366,19 +405,20 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma if (!result.is_same(self)) { at::native::resize_output(result, {mat1.sizes()[0], mat2.sizes()[1]}); + // We use bias ptr in the Lt path only when bias is 1D + const auto use_bias_ptr_lt = (self.dim() == 1) && !disable_addmm_cuda_lt; const auto self_maybe_expanded = [&]() -> c10::MaybeOwned { - if (disable_addmm_cuda_lt) { - // When in non-Lt path we do expand self even before + if (!use_bias_ptr_lt) { + // We do expand self even before // check for beta != 0.0 to make sure that // test_sparse_csr.py::TestSparseCSRCUDA::test_addmm_errors_* // runs green. return expand_size(self, result.sizes(), "addmm"); } - // copy next, should broadcast return c10::MaybeOwned::borrowed(self); }(); - // We copy bias when in the non-Lt path - if (beta.toComplexDouble() != 0.0 && disable_addmm_cuda_lt) { + // We do not copy bias only when we need the bias ptr + if (beta.toComplexDouble() != 0.0 && !use_bias_ptr_lt) { // NOTE: self should broadcast over result at::native::copy_(result, *self_maybe_expanded); } diff --git a/aten/src/ATen/native/cuda/CUDALoops.cuh b/aten/src/ATen/native/cuda/CUDALoops.cuh index c42d03b9cbf7f..b83ec3c761e9b 100644 --- a/aten/src/ATen/native/cuda/CUDALoops.cuh +++ b/aten/src/ATen/native/cuda/CUDALoops.cuh @@ -884,6 +884,69 @@ struct type_specialized_kernel_launcher { } }; +template +struct type_specialized_broadcast_kernel_launcher { + template < + typename func_t, + typename array_t, + typename dtypes_t, + typename calc_t> + static void apply( + int64_t numel, + func_t f, + array_t data, + dtypes_t dtypes, + calc_t offset_calc) { + using traits = function_traits; + using ret_t = typename traits::result_type; + using arg0_t = typename traits::template arg<0>::type; + using arg1_t = typename traits::template arg<1>::type; + if (dtypes[0] == rt_binary_specializations[arg_index][0] && + dtypes[1] == rt_binary_specializations[arg_index][1] && + dtypes[2] == rt_binary_specializations[arg_index][2]) { + using ret_cpp_t = c10::impl::ScalarTypeToCPPTypeT; + using arg0_cpp_t = c10::impl::ScalarTypeToCPPTypeT; + using arg1_cpp_t = c10::impl::ScalarTypeToCPPTypeT; + constexpr int grp_sz = 128; + launch_legacy_kernel_manual_unroll(numel, [=] GPU_LAMBDA(int idx, bool unrl) { + if (unrl) { + auto offsets0 = offset_calc.get(idx); + auto offsets1 = offset_calc.get(idx + grp_sz); + auto offsets2 = offset_calc.get(idx + grp_sz * 2); + auto offsets3 = offset_calc.get(idx + grp_sz * 3); + void* out0 = data[0] + offsets0[0]; + void* out1 = data[0] + offsets1[0]; + void* out2 = data[0] + offsets2[0]; + void* out3 = data[0] + offsets3[0]; + auto u = c10::load(data[1] + offsets0[1]); + auto v = c10::load(data[2] 
+ offsets0[2]); + ret_t result0 = f(c10::convert(u), c10::convert(v)); + auto u1 = c10::load(data[1] + offsets1[1]); + auto v1 = c10::load(data[2]+ offsets1[2]); + ret_t result1 = f(c10::convert(u1), c10::convert(v1)); + auto u2 = c10::load(data[1] + offsets2[1]); + auto v2 = c10::load(data[2] + offsets2[2]); + ret_t result2 = f(c10::convert(u2), c10::convert(v2)); + auto u3 = c10::load(data[1] + offsets3[1]); + auto v3 = c10::load(data[2] + offsets3[2]); + ret_t result3 = f(c10::convert(u3), c10::convert(v3)); + *(ret_cpp_t*)out0 = c10::convert(result0); + *(ret_cpp_t*)out1 = c10::convert(result1); + *(ret_cpp_t*)out2 = c10::convert(result2); + *(ret_cpp_t*)out3 = c10::convert(result3); + } else { + auto offsets = offset_calc.get(idx); + void* out = data[0] + offsets[0]; + auto u = c10::load(data[1] + offsets[1]); + auto v = c10::load(data[2] + offsets[2]); + ret_t result = f(c10::convert(u), c10::convert(v)); + *(ret_cpp_t*)out = c10::convert(result); + } + }); + } + } +}; + } // namespace #endif @@ -1002,6 +1065,32 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) { } auto offset_calc = ::make_offset_calculator(iter); #ifdef USE_ROCM + if (check_binary_rt_types_for_specialization(iter)) { + // constexpr to reduce the amount of kernels generated for + // broadcast elementwise with mexed dtypes and limit which functors are actually + // applied to the load and store at compile time. + using func_tuple = typename traits::ArgsTuple; + if constexpr ( + std::is_same_v && traits::arity == 2 && + check_binary_functor_types_for_specialization< + func_tuple, + float, + float, + traits::arity, + /*arg_num=*/0>::check()) { + memory::detail::static_unroll< + type_specialized_broadcast_kernel_launcher, + rt_binary_specializations.size()>::with_args( + numel, + f, + data, + dtypes, + offset_calc + ); + return; + } + } + constexpr int grp_sz = 128; launch_legacy_kernel_manual_unroll(numel, [=] GPU_LAMBDA(int idx, bool unrl) { if (unrl) { diff --git a/aten/src/ATen/native/cuda/CompositeRandomAccessor.h b/aten/src/ATen/native/cuda/CompositeRandomAccessor.h index d47a7fa776f1b..eb8587d1f9337 100644 --- a/aten/src/ATen/native/cuda/CompositeRandomAccessor.h +++ b/aten/src/ATen/native/cuda/CompositeRandomAccessor.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu index 344906a2a4df2..88c552e9bf120 100644 --- a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu @@ -75,30 +75,52 @@ static inline bool can_use_int32_nhwc( return true; } +static inline bool can_use_int32_nchw( + int64_t nbatch, int64_t channels, + int64_t height, int64_t width, + int64_t pooled_height, int64_t pooled_width) { + int64_t hw = height * width; + return can_use_int32_nhwc( + nbatch, channels, height, width, + pooled_height, pooled_width, + channels * hw, // in_stride_n + hw, // in_stride_c + width, // in_stride_h + 1 // in_stride_w + ); +} + // kernels borrowed from Caffe -template -__global__ void max_pool_forward_nchw(const int nthreads, const scalar_t* bottom_data, - const int64_t channels, const int64_t height, - const int64_t width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, const int pad_h, const int pad_w, - const int dilation_h, const int dilation_w, scalar_t* top_data, +template +__global__ void max_pool_forward_nchw( + const 
index_t nthreads, + const scalar_t* bottom_data, + const int64_t channels, + const int64_t height, + const int64_t width, + const int pooled_height, + const int pooled_width, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, + const int dilation_h, const int dilation_w, + scalar_t* top_data, int64_t* top_mask) { - CUDA_KERNEL_LOOP(index, nthreads) { - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - int hend = min(hstart + (kernel_h - 1) * dilation_h + 1, height); - int wend = min(wstart + (kernel_w - 1) * dilation_w + 1, width); + CUDA_KERNEL_LOOP_TYPE(index, nthreads, index_t) { + index_t pw = index % pooled_width; + index_t ph = (index / pooled_width) % pooled_height; + index_t c = (index / pooled_width / pooled_height) % channels; + index_t n = index / pooled_width / pooled_height / channels; + index_t hstart = ph * stride_h - pad_h; + index_t wstart = pw * stride_w - pad_w; + index_t hend = min(hstart + (kernel_h - 1) * dilation_h + 1, height); + index_t wend = min(wstart + (kernel_w - 1) * dilation_w + 1, width); while(hstart < 0) hstart += dilation_h; while(wstart < 0) wstart += dilation_w; scalar_t maxval = at::numeric_limits::lower_bound(); // -Infinity - int maxidx = hstart * width + wstart; + index_t maxidx = hstart * width + wstart; const scalar_t* btm_data = bottom_data + (n * channels + c) * height * width; for (int h = hstart; h < hend; h += dilation_h) { for (int w = wstart; w < wend; w += dilation_w) { @@ -251,32 +273,39 @@ __global__ void max_pool_forward_nhwc( static constexpr int BLOCK_THREADS = 256; -template +template #if defined (USE_ROCM) C10_LAUNCH_BOUNDS_2(BLOCK_THREADS, 4) #else C10_LAUNCH_BOUNDS_2(BLOCK_THREADS, 8) #endif -__global__ void max_pool_backward_nchw(const scalar_t* top_diff, - const int64_t* top_mask, const int num, const int64_t channels, - const int64_t height, const int64_t width, const int pooled_height, - const int pooled_width, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, const int pad_h, const int pad_w, +__global__ void max_pool_backward_nchw( + const scalar_t* top_diff, + const int64_t* top_mask, + const index_t num, + const index_t channels, + const index_t height, + const index_t width, + const index_t pooled_height, + const index_t pooled_width, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, scalar_t* bottom_diff) { - CUDA_KERNEL_LOOP(index, height*width) { - int h = index / width; - int w = index - h * width; - int phstart = p_start(h, pad_h, kernel_h, dilation_h, stride_h); - int phend = p_end(h, pad_h, pooled_height, stride_h); - int pwstart = p_start(w, pad_w, kernel_w, dilation_w, stride_w); - int pwend = p_end(w, pad_w, pooled_width, stride_w); - for (int n = blockIdx.y; n < num; n += gridDim.y) { - for (int c = blockIdx.z; c < channels; c+= gridDim.z) { + CUDA_KERNEL_LOOP_TYPE(index, height*width, index_t) { + index_t h = index / width; + index_t w = index - h * width; + index_t phstart = p_start(h, pad_h, kernel_h, dilation_h, stride_h); + index_t phend = p_end(h, pad_h, pooled_height, stride_h); + index_t pwstart = p_start(w, pad_w, kernel_w, dilation_w, stride_w); + index_t pwend = 
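The `use_int32` dispatch above comes down to whether 32-bit index arithmetic can overflow. A rough sketch of that criterion (a simplification; the real `can_use_int32_nchw` goes through the stride-aware `can_use_int32_nhwc` check):

```python
INT32_MAX = 2**31 - 1

def fits_int32(nbatch, channels, height, width, pooled_h, pooled_w):
    # 32-bit indexing is only safe if the largest linear offset the kernel
    # can form, over both the input and the pooled output, fits in int32.
    in_offsets = nbatch * channels * height * width
    out_offsets = nbatch * channels * pooled_h * pooled_w
    return max(in_offsets, out_offsets) <= INT32_MAX

# A 1 x 64 x 8192 x 8192 input already needs the 64-bit kernel:
print(fits_int32(1, 64, 8192, 8192, 4096, 4096))  # False
```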
p_end(w, pad_w, pooled_width, stride_w); + for (index_t n = blockIdx.y; n < num; n += gridDim.y) { + for (index_t c = blockIdx.z; c < channels; c += gridDim.z) { accscalar_t gradient = accscalar_t(0); - int offset = (n * channels + c) * pooled_height * pooled_width; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { + index_t offset = (n * channels + c) * pooled_height * pooled_width; + for (index_t ph = phstart; ph < phend; ++ph) { + for (index_t pw = pwstart; pw < pwend; ++pw) { if (top_mask[ph * pooled_width + pw + offset] == h * width + w) { gradient += static_cast(top_diff[ph * pooled_width + pw + offset]); } @@ -469,8 +498,6 @@ const Tensor& indices) { const int64_t in_stride_h = input.stride(-2); const int64_t in_stride_w = input.stride(-1); - const int count = safe_downcast(output.numel()); - AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "max_pool2d_with_indices_out_cuda_frame", [&] { @@ -553,14 +580,42 @@ const Tensor& indices) { break; } case MemoryFormat::Contiguous: { - const int num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, - BLOCK_THREADS); - max_pool_forward_nchw - <<>>( - count, input_data, - nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, - kH, kW, dH, dW, padH, padW, dilationH, dilationW, - output_data, indices_data); + const int threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, + BLOCK_THREADS); + const int64_t nthreads = output.numel(); + bool use_int32 = can_use_int32_nchw( + nbatch, nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth); + const int maxGridX = at::cuda::getCurrentDeviceProperties()->maxGridSize[0]; + const int blocks = static_cast(std::min( + ceil_div(nthreads, static_cast(threads)), + static_cast(maxGridX))); + auto stream = at::cuda::getCurrentCUDAStream(); + if (use_int32) { + max_pool_forward_nchw + <<>>( + static_cast(nthreads), + input_data, + static_cast(nInputPlane), + static_cast(inputHeight), + static_cast(inputWidth), + static_cast(outputHeight), + static_cast(outputWidth), + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + output_data, indices_data); + } else { + max_pool_forward_nchw + <<>>( + nthreads, + input_data, + nInputPlane, + inputHeight, + inputWidth, + outputHeight, + outputWidth, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + output_data, indices_data); + } C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } @@ -633,8 +688,6 @@ const Tensor& gradInput) { gradInput.zero_(); - int64_t count = input.numel(); - AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "max_pool2d_with_indices_out_cuda_frame", [&] { @@ -692,25 +745,45 @@ const Tensor& gradInput) { break; } case MemoryFormat::Contiguous: { - int imgcount = inputWidth * inputHeight; - dim3 grid; - const int blocks = (imgcount + BLOCK_THREADS - 1) / BLOCK_THREADS; - grid.x = blocks; - grid.y = nbatch; - uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; - if (maxGridY < grid.y) grid.y = maxGridY; - grid.z = nInputPlane; - uint64_t maxGridZ = at::cuda::getCurrentDeviceProperties()->maxGridSize[2]; - if (maxGridZ < grid.z) grid.z = maxGridZ; - - max_pool_backward_nchw - <<>>( - gradOutput_data, - indices_data, - nbatch, - nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, - kH, kW, dH, dW, padH, padW, dilationH, dilationW, - gradInput_data); + const int threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, + BLOCK_THREADS); + const int 
imgcount = inputWidth * inputHeight; + const int maxGridX = at::cuda::getCurrentDeviceProperties()->maxGridSize[0]; + const int maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; + const int maxGridZ = at::cuda::getCurrentDeviceProperties()->maxGridSize[2]; + const int blocks_x = std::min(ceil_div(imgcount, threads), maxGridX); + dim3 grid(blocks_x, static_cast(std::min(nbatch, maxGridY)), static_cast(std::min(nInputPlane, maxGridZ))); + bool use_int32 = can_use_int32_nchw( + nbatch, nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth); + auto stream = at::cuda::getCurrentCUDAStream(); + if (use_int32) { + max_pool_backward_nchw + <<>>( + gradOutput_data, + indices_data, + static_cast(nbatch), + static_cast(nInputPlane), + static_cast(inputHeight), + static_cast(inputWidth), + static_cast(outputHeight), + static_cast(outputWidth), + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + gradInput_data); + } else { + max_pool_backward_nchw + <<>>( + gradOutput_data, + indices_data, + nbatch, + nInputPlane, + inputHeight, + inputWidth, + outputHeight, + outputWidth, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + gradInput_data); + } C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } diff --git a/aten/src/ATen/native/cuda/GroupedBlas.cpp b/aten/src/ATen/native/cuda/GroupedBlas.cpp index f64eb317d0cca..64a1792e51916 100644 --- a/aten/src/ATen/native/cuda/GroupedBlas.cpp +++ b/aten/src/ATen/native/cuda/GroupedBlas.cpp @@ -22,6 +22,9 @@ #include #include #include +#ifdef USE_ROCM +#include +#endif #include #ifdef USE_FBGEMM_GENAI @@ -666,12 +669,26 @@ std::optional out_dtype) { // _scaled_mm_allowed_device is used here within _grouped_mm_cuda which seems incorrect since scale is not used. // the _grouped_mm_fallback should be safe for any ROCm GPU since it's just calling typical mm/bmm bool use_fast_path = false; + // On non CK system(w/ ROCm), make sure use_fast_path is false +#if defined(USE_ROCM_CK_GEMM) + if (at::detail::getCUDAHooks().isGPUArch({"gfx942", "gfx950"})) { + use_fast_path = true; + } +#endif //USE_ROCM_CK_GEMM #endif const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype); Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_); if (use_fast_path) { // fast path, no d2h sync needed +#ifndef USE_ROCM at::cuda::detail::bf16bf16_grouped_mm(mat_a, mat_b, offs, bias, out); +#else +#if defined(USE_ROCM_CK_GEMM) + at::hip::detail::group_gemm_ck(mat_a, mat_b, offs, bias, out); +#else + TORCH_WARN("ROCm: Group Gemm through CK not selected."); +#endif //USE_ROCM_CK_GEMM +#endif } else { _grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out); } diff --git a/aten/src/ATen/native/cuda/IndexKernel.cu b/aten/src/ATen/native/cuda/IndexKernel.cu index 927af661396cd..db85f62c8d124 100644 --- a/aten/src/ATen/native/cuda/IndexKernel.cu +++ b/aten/src/ATen/native/cuda/IndexKernel.cu @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include @@ -74,7 +73,6 @@ void gpu_index_kernel(TensorIteratorBase& iter, const IntArrayRef index_size, co char* const out_ptr = static_cast(iter.data_ptr(0)); char* const in_ptr = static_cast(iter.data_ptr(1)); - if (is_gather_like && num_indices==1) { const size_t element_size = iter.element_size(0); constexpr size_t alignment = 16; @@ -84,16 +82,9 @@ void gpu_index_kernel(TensorIteratorBase& iter, const IntArrayRef index_size, co auto ind_dim_size = index_size[0]; auto inp_stride_bytes = index_stride[0]; auto out_stride_bytes = iter.strides(0)[1]; - // avoid grid overflow 
in the fast kernel - const int64_t vec_chunks = ceil_div(slice_size, alignment); - const int64_t blocks_per_slice_upper = ceil_div(vec_chunks, (int64_t)launch_size_nd); - const int max_grid_y = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; - // if it's an eligible grid we use the fast path, otherwise default to slower path - if (blocks_per_slice_upper <= max_grid_y) { - at::native::vectorized_gather_kernel_launch(out_ptr, in_ptr, (int64_t*)iter.data_ptr(2), num_ind, - slice_size, ind_dim_size, inp_stride_bytes, out_stride_bytes, /*allow_neg_indices*/true); - return; - } + at::native::vectorized_gather_kernel_launch(out_ptr, in_ptr, (int64_t*)iter.data_ptr(2), num_ind, + slice_size, ind_dim_size, inp_stride_bytes, out_stride_bytes, /*allow_neg_indices*/true); + return; } } diff --git a/aten/src/ATen/native/cuda/IndexKernelUtils.cu b/aten/src/ATen/native/cuda/IndexKernelUtils.cu index 8343c60418952..1e998251dd7be 100644 --- a/aten/src/ATen/native/cuda/IndexKernelUtils.cu +++ b/aten/src/ATen/native/cuda/IndexKernelUtils.cu @@ -13,11 +13,12 @@ __global__ void vectorized_gather_kernel(char * out, char * inp, index_t * idx, if (allow_neg_indices) { ind = (ind < 0) ? ind + ind_dim_size : ind; } - CUDA_KERNEL_ASSERT_VERBOSE(ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds", "Expected 0 <= index < ind_dim_size(%ld), but got index = %ld", ind_dim_size, ind); - int32_t off = (blockDim.x * blockIdx.y + threadIdx.x) * Alignment; // off is guaranteed to be within int32 limits - if (off >= slice_size) return; - auto vec = at::native::memory::ld_vec(inp + ind * inp_stride + off); - at::native::memory::st_vec(out + blockIdx.x * (int32_t)out_stride + off, vec); // out offset is guaranteed to be within int32 limits + CUDA_KERNEL_ASSERT_VERBOSE(ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"); + // off is guaranteed to be within int32 limits + for (int32_t off = (blockDim.x * blockIdx.y + threadIdx.x) * Alignment; off < slice_size; off += blockDim.x * gridDim.y * Alignment) { + auto vec = at::native::memory::ld_vec(inp + ind * inp_stride + off); + at::native::memory::st_vec(out + blockIdx.x * (int32_t)out_stride + off, vec); // out offset is guaranteed to be within int32 limits + } } @@ -30,7 +31,9 @@ void vectorized_gather_kernel_launch(char * out, char * inp, index_t * idx, int auto num_threads = at::round_up( at::ceil_div(slice_size_in_bytes, Alignment), static_cast(C10_WARP_SIZE)); - dim3 grid = {static_cast(num_ind), static_cast(at::ceil_div(slice_size_in_bytes, max_num_threads * Alignment)), 1}; + uint32_t grid_y = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; + grid_y = std::min(static_cast(at::ceil_div(slice_size_in_bytes, max_num_threads * Alignment)), grid_y); + dim3 grid = {static_cast(num_ind), grid_y, 1}; auto block = std::min(max_num_threads, num_threads); vectorized_gather_kernel<<>>(out, inp, idx, num_ind, slice_size_in_bytes, ind_dim_size, inp_stride_bytes, out_stride_bytes, allow_neg_indices); diff --git a/aten/src/ATen/native/cuda/ScaledBlas.cpp b/aten/src/ATen/native/cuda/ScaledBlas.cpp index 0d2963874abbd..9065d79929360 100644 --- a/aten/src/ATen/native/cuda/ScaledBlas.cpp +++ b/aten/src/ATen/native/cuda/ScaledBlas.cpp @@ -59,6 +59,24 @@ // forward declare class cublasCommonArgs; +#ifndef _WIN32 +namespace fbgemm_gpu { + +// NOTE(slayton58): FBGemm_GPU kernels come from within the FBGemm repo. +// To update supported ops means a submodule bump, which is.. painful. 
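At the tensor level, the vectorized gather kernel touched above copies one full slice per index, with negative indices wrapping; the new grid-stride loop simply lets one launch cover slices larger than the old per-launch grid limit. A pure-PyTorch reference of what it computes:

```python
import torch

inp = torch.randn(10, 256)
idx = torch.tensor([3, -1, 7])   # -1 wraps to 9

out = inp[idx]                   # one contiguous slice copied per index
torch.testing.assert_close(out, inp.index_select(0, idx % inp.size(0)))
```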
Instead, we +// can simply forward-declare the methods we want to use.. Works at least as a short-term +// thing, but should still be fixed somewhere/somehow. +at::Tensor f4f4bf16( + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + std::optional, + bool use_mx); + +} // namespace fbgemm_gpu +#endif + using at::blas::ScalingType; using at::blas::SwizzleType; @@ -1087,26 +1105,47 @@ _scaled_mxfp4_mxfp4( const std::optional& bias, const c10::ScalarType out_dtype, Tensor& out) { -#ifndef USE_ROCM - TORCH_CHECK_NOT_IMPLEMENTED(false, "MXFP4 scaling supported on ROCM only"); -#endif +#if defined(_WIN32) || (!defined(USE_ROCM) && !defined(USE_FBGEMM_GENAI)) + TORCH_CHECK_NOT_IMPLEMENTED(false, "MXFP4 scaling supported on ROCM and CUDA+FBGEMM_GENAI only"); +#else // Restrictions: // A, B are FP4, scales are e8m0, A: shape K//32, B: K, N//32 TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2 && mat_b.scalar_type() == at::kFloat4_e2m1fn_x2, "mat_a and mat_b must be fp4 types, got: ", mat_a.scalar_type(), mat_b.scalar_type()); - auto scale_a_elems = ceil_div(2 * mat_a.size(0), 32) * mat_a.size(1); - auto scale_b_elems = ceil_div(2 * mat_b.size(1), 32) * mat_b.size(0); + // Packed FP4 format means actual-K = 2 * reported-K -- adjust + auto K_multiplier = 2; +#ifdef USE_ROCM + // AMD + auto scale_a_elems = ceil_div(K_multiplier * mat_a.size(0), 32) * mat_a.size(1); + auto scale_b_elems = ceil_div(K_multiplier * mat_b.size(1), 32) * mat_b.size(0); +#else + // NVIDIA + auto scale_a_elems = round_up(mat_a.size(0), 128) * round_up(ceil_div(K_multiplier * mat_a.size(1), 32), 4); + auto scale_b_elems = round_up(mat_b.size(1), 128) * round_up(ceil_div(K_multiplier * mat_b.size(0), 32), 4); +#endif TORCH_CHECK_VALUE(scale_a_elems == scale_a.numel(), "For Blockwise scaling scale_a should have ", scale_a_elems, " elements, got: ", scale_a.numel()); TORCH_CHECK_VALUE(scale_b_elems == scale_b.numel(), "For Blockwise scaling scale_b should have ", scale_b_elems, " elements, got: ", scale_b.numel()); +#ifdef USE_ROCM + // AMD + TORCH_CHECK_VALUE(swizzle_a == SwizzleType::NO_SWIZZLE, "scale_a must not be swizzled (NO_SWIZZLE format)"); + TORCH_CHECK_VALUE(swizzle_b == SwizzleType::NO_SWIZZLE, "scale_b must not be swizzled (NO_SWIZZLE format)"); +#else + // NVIDIA + TORCH_CHECK_VALUE(swizzle_a == SwizzleType::SWIZZLE_32_4_4, "scale_a must be swizzled to SWIZZLE_32_4_4 format"); + TORCH_CHECK_VALUE(swizzle_b == SwizzleType::SWIZZLE_32_4_4, "scale_b must be swizzled to SWIZZLE_32_4_4 format"); +#endif + TORCH_CHECK_VALUE(scale_a.is_contiguous() && scale_b.is_contiguous(), "For Blockwise scaling both scales should be contiguous"); TORCH_CHECK_VALUE(out.scalar_type() == out_dtype, "expected out.scalar_type() to be ", out_dtype, ", but got ", out_dtype); +#ifdef USE_ROCM + // AMD auto scaling_choice_a = ScalingType::BlockWise1x32; auto scaling_choice_b = ScalingType::BlockWise1x32; @@ -1121,11 +1160,30 @@ _scaled_mxfp4_mxfp4( TORCH_CHECK_VALUE(out.scalar_type() == ScalarType::BFloat16 || out.scalar_type() == ScalarType::Half, "Block-wise scaling only supports BFloat16 or Half output types"); -#else - TORCH_CHECK_NOT_IMPLEMENTED(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later"); #endif return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out); +#else + // NVIDIA + // NOTE(slayton58): fbgemm_gpu::f4f4bf16 does *not* allow passing an output tensor, + // but we have one we need to use. 
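The expected scale-element counts checked above can be written out directly; the helper below mirrors the two formulas from the diff (packed fp4 doubles the K extent, one e8m0 scale per 32 elements, and the CUDA/FBGEMM path pads for the 32x4x4 swizzle):

```python
def ceil_div(a, b):
    return -(-a // b)

def round_up(a, b):
    return ceil_div(a, b) * b

def mxfp4_scale_a_elems(size0, size1, rocm):
    k_mult = 2  # packed fp4: two values per stored element
    if rocm:
        # ROCm: plain 1x32 block scales, no swizzle
        return ceil_div(k_mult * size0, 32) * size1
    # CUDA + FBGEMM_GENAI: padded for the SWIZZLE_32_4_4 layout
    return round_up(size0, 128) * round_up(ceil_div(k_mult * size1, 32), 4)
```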
Two clear options are to copy into + // our output (slow), or use a move-assignment-operator (faster). + // However, the compiler can complain about the explicit move preventing + // copy elision because the return from f4f4bf16 is a temporary object. + // So we don't explicitly move, and trust the compiler here... + // In the longer term this should be fixed on the FBGemm side. + out = fbgemm_gpu::f4f4bf16( + mat_a, + mat_b.transpose(-2, -1), + scale_a, + scale_b, + std::nullopt, /* global_scale */ + true /* use_mx */ + ); + + return out; +#endif +#endif } Tensor& @@ -1250,17 +1308,20 @@ _scaled_mm_cuda_v2_out( mat_a.size(0), "x", mat_a.size(1), " and ", mat_b.size(0), "x", mat_b.size(1), ")"); } + // Handle fp4 packed-K dimension + int K_multiplier = (mat_a.scalar_type() == ScalarType::Float4_e2m1fn_x2) ? 2 : 1; + TORCH_CHECK_VALUE(!bias || bias->numel() == mat_b.sizes()[1], "Bias must be size ", mat_b.sizes()[1], " but got ", bias->numel()); TORCH_CHECK_VALUE( - mat_a.sizes()[1] % 16 == 0, + K_multiplier * mat_a.sizes()[1] % 16 == 0, "Expected trailing dimension of mat1 to be divisible by 16 ", "but got mat1 shape: (", mat_a.sizes()[0], "x", - mat_a.sizes()[1], + K_multiplier * mat_a.sizes()[1], ")."); - TORCH_CHECK_VALUE(mat_b.sizes()[0] % 16 == 0 && mat_b.sizes()[1] % 16 == 0, "mat2 shape (", mat_b.sizes()[0], "x", + TORCH_CHECK_VALUE(K_multiplier * mat_b.sizes()[0] % 16 == 0 && mat_b.sizes()[1] % 16 == 0, "mat2 shape (", mat_b.sizes()[0], "x", mat_b.sizes()[1], ") must be divisible by 16"); // TODO(slayton): Existing checks, not sure if they should really be here. diff --git a/aten/src/ATen/native/cuda/ScanUtils.cuh b/aten/src/ATen/native/cuda/ScanUtils.cuh index c4d86acb43e7b..693ad0cb6ce10 100644 --- a/aten/src/ATen/native/cuda/ScanUtils.cuh +++ b/aten/src/ATen/native/cuda/ScanUtils.cuh @@ -267,15 +267,15 @@ void scan_dim_with_indices(const TensorBase& self, const TensorBase& values, con * outer dimensions, which contains several "inner rows"). * Each thread processes a single inner row at a time. 
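Seen from Python, the outer-dim scan kernel documented above produces an inclusive scan along a non-last dimension; every (outer row, inner row) pair is independent, which is why one thread can own a single inner row:

```python
import torch

x = torch.arange(24.).reshape(2, 3, 4)
# Scan along dim 1; dims 0 and 2 index independent "rows" of the scan.
print(torch.cumsum(x, dim=1))
```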
*/ -template +template __global__ void tensor_kernel_scan_outer_dim(scalar_t *tgt_, const scalar_t *src_, const uint32_t num_orows, const uint32_t num_irows, const uint32_t row_size, const scalar_t init, BinaryOp binary_op) { for (uint32_t orow = blockIdx.x; orow < num_orows; orow += gridDim.x) { for (uint32_t irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) { - const scalar_t *src = src_ + orow * row_size * num_irows + irow; - scalar_t *tgt = tgt_ + orow * row_size * num_irows + irow; + const scalar_t *src = src_ + static_cast(orow) * row_size * num_irows + irow; + scalar_t *tgt = tgt_ + (index_t) orow * row_size * num_irows + irow; scalar_t acc = init; for (uint32_t col = 0; col < row_size; ++col) { @@ -409,10 +409,15 @@ __host__ void scan_outer_dim(const TensorBase& self, const TensorBase& result, check_fits_in_unsigned(num_irows, "num_irows"); check_fits_in_unsigned(num_orows, "num_orows"); check_fits_in_unsigned(row_size, "row_size"); - - tensor_kernel_scan_outer_dim<<>>( + if (static_cast(num_irows) * num_orows * row_size <= UINT_MAX) { + tensor_kernel_scan_outer_dim<<>>( + result.mutable_data_ptr(), self.const_data_ptr(), + num_orows, num_irows, row_size, init, binary_op); + } else { + tensor_kernel_scan_outer_dim<<>>( result.mutable_data_ptr(), self.const_data_ptr(), num_orows, num_irows, row_size, init, binary_op); + } C10_CUDA_KERNEL_LAUNCH_CHECK(); } diff --git a/aten/src/ATen/native/hip/ck_group_gemm.h b/aten/src/ATen/native/hip/ck_group_gemm.h new file mode 100644 index 0000000000000..c50307c9f8ea3 --- /dev/null +++ b/aten/src/ATen/native/hip/ck_group_gemm.h @@ -0,0 +1,19 @@ +#pragma once + +#include +#include +#include + +namespace at { +namespace hip { +namespace detail { +void group_gemm_ck( + const at::Tensor& mat_a, + const at::Tensor& mat_b, + const std::optional& offs, + const std::optional& bias, + at::Tensor& out); + +} // namespace detail +} // namespace hip +} // namespace at diff --git a/aten/src/ATen/native/hip/ck_group_gemm.hip b/aten/src/ATen/native/hip/ck_group_gemm.hip new file mode 100644 index 0000000000000..c436ad660c1c7 --- /dev/null +++ b/aten/src/ATen/native/hip/ck_group_gemm.hip @@ -0,0 +1,462 @@ +#undef __HIP_NO_HALF_CONVERSIONS__ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +template +using S = ck::Sequence; + +namespace at { +namespace hip { +namespace detail { + +namespace CkTypes { + using BF16 = ck::bhalf_t; + using F16 = ck::half_t; + using F32 = float; + using PassThrough = ck::tensor_operation::element_wise::PassThrough; +} + +template +using GroupedGemmKernel = ck::tensor_operation::device::DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage< + ALayout, BLayout, ck::Tuple<>, ck::tensor_layout::gemm::RowMajor, + DataType, DataType, CkTypes::F32, DataType, ck::Tuple<>, DataType, + CkTypes::PassThrough, CkTypes::PassThrough, CkTypes::PassThrough, + ck::tensor_operation::device::GemmSpecialization::MNKPadding, + 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, + S<1,4,64,1>, S<0,2,1,3>, S<0,2,1,3>, + 3, 8, 8, 1, + S<1,4,64,1>, S<0,2,1,3>, S<0,2,1,3>, + 3, 8, 8, 1, + 1, 1, + S<1,32,1,8>, 4 +>; + +template +void launch_grouped_bgemm_ck_impl_dispatch( + const at::Tensor& mat_a, + const at::Tensor& mat_b, + const std::optional& offs, + at::Tensor& out) +{ + using DeviceOp = GroupedGemmKernel; + using PassThrough = CkTypes::PassThrough; + + std::vector gemm_descs; + std::vector p_a_ptrs, p_b_ptrs; + std::vector 
p_e_ptrs; + // Note: d_ptrs will be resized after we populate the other vectors + + const int mat_a_dim = mat_a.dim(); + const int mat_b_dim = mat_b.dim(); + + const char* a_ptr_base = reinterpret_cast(mat_a.data_ptr()); + const char* b_ptr_base = reinterpret_cast(mat_b.data_ptr()); + char* out_ptr_base = reinterpret_cast(out.data_ptr()); + const size_t a_element_size = mat_a.element_size(); + const size_t b_element_size = mat_b.element_size(); + const size_t out_element_size = out.element_size(); + + // for each group, calculate m,n,k,lda,ldb,ldc and A,B,out pointer base addresses. + if (mat_a_dim == 2 && mat_b_dim == 2) { + // 2D*2D case requires offset tensor + auto offs_accessor = offs->accessor(); + int num_groups = offs_accessor.size(0); + const int M = mat_a.size(0); // number of rows in A + const int N = mat_b.size(1); // number of columns in B + const int K = mat_a.size(1); // columns in A == rows in B + // for 2d*2d input, output is 3d. + // for each group, A columns (K) are sliced. M and N dimensions are not sliced. + for (int i = 0; i < num_groups; ++i) { + int start_k = (i == 0) ? 0 : offs_accessor[i-1]; + int end_k = offs_accessor[i]; + int k = end_k - start_k; + + //K dimension are sliced, hence select stride(1) always. + //K dimension is always dimension 1, regardless of memory layout (row/column major) + const void* group_a_ptr = a_ptr_base + start_k * mat_a.stride(1) * a_element_size; + const void* group_b_ptr; + int ldb; + + if (std::is_same::value) { + // Row-major B [K,N]: K values are horizontally adjacent, use stride(1) for K offset + group_b_ptr = b_ptr_base + start_k * mat_b.stride(1) * b_element_size; + // Leading dimension = distance between rows = stride(0) + ldb = mat_b.stride(0); + } else { + // Column-major B [K,N]: K values are vertically adjacent, use stride(0) for K offset + group_b_ptr = b_ptr_base + start_k * mat_b.stride(0) * b_element_size; + // Leading dimension = distance between columns = stride(1) + ldb = mat_b.stride(1); + } + + // Calculate output pointer for group i in 3D tensor [num_groups, M, N] + // stride(0) = M*N elements between groups, so skip i*stride(0) elements to reach group i + void* group_e_ptr = out_ptr_base + i * out.stride(0) * out_element_size; + int lda, ldc; + if (std::is_same::value) { + // Row-major A [M,K]: leading dimension = distance between rows = stride(0) + lda = mat_a.stride(0); + } else { + // Column-major A [M,K]: leading dimension = distance between columns = stride(1) + lda = mat_a.stride(1); + } + // Output is always row-major in 3D tensor [num_groups, M, N] + // Leading dimension for each group's [M,N] slice = stride(1) = N + ldc = out.stride(1); + size_t output_group_bytes = M * N * out_element_size; + void* group_e_ptr_end = (char*)group_e_ptr + output_group_bytes; + + gemm_descs.push_back({ + static_cast(M), + static_cast(N), + static_cast(k), + static_cast(lda), + static_cast(ldb), + static_cast(ldc), + {} // --> stride_Ds_ + }); + p_a_ptrs.push_back(group_a_ptr); + p_b_ptrs.push_back(group_b_ptr); + p_e_ptrs.push_back(group_e_ptr); + } + } else if (mat_a_dim == 2 && mat_b_dim == 3) { + // 2D*3D case requires offset tensor + auto offs_accessor = offs->accessor(); + int num_groups = offs_accessor.size(0); + + // 2d*3d input, output is 2d. 
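A pure-PyTorch reference for the 2D x 2D grouped case set up above, where `offs` holds cumulative K boundaries; the 2D x 3D and 3D x 2D cases below slice M of A or N of B analogously. This is a readability aid, not the CK kernel:

```python
import torch

def grouped_mm_2d_2d_ref(A, B, offs):
    # A: [M, K_total], B: [K_total, N], offs: cumulative K boundaries.
    # Each group multiplies matching K-slices; output is [num_groups, M, N].
    out, start = [], 0
    for end in offs.tolist():
        out.append(A[:, start:end] @ B[start:end, :])
        start = end
    return torch.stack(out)

A, B = torch.randn(4, 12), torch.randn(12, 5)
offs = torch.tensor([3, 7, 12])
print(grouped_mm_2d_2d_ref(A, B, offs).shape)  # torch.Size([3, 4, 5])
```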
+ // A: [m * n_groups, k], B: [n_groups, n, k] or [n_groups, k, n], Output: [m * n_groups, n] + // Offset divides M dimension (rows of A), each group gets different rows of A and different batch of B + const int K = mat_a.size(1); // columns in A + // For 2D-3D case: The output determines N (result width) + const int N = out.size(1); // N is the width of the output tensor + + for (int i = 0; i < num_groups; ++i) { + int start_m = (i == 0) ? 0 : offs_accessor[i - 1]; + int end_m = offs_accessor[i]; + int m = end_m - start_m; + + // Skip zero-sized groups but continue processing subsequent groups + if (m <= 0) { + continue; + } + + // Select A rows for group i: skip start_m rows + const void* group_a_ptr; + int lda; + if (std::is_same::value) { + // Row-major A [total_m, K]: skip start_m rows, each row is stride(0) elements apart + group_a_ptr = a_ptr_base + start_m * mat_a.stride(0) * a_element_size; + lda = mat_a.stride(0); // distance between rows + } else { + // Column-major A [total_m, K]: skip start_m elements in the first dimension (stride(0) is between rows) + group_a_ptr = a_ptr_base + start_m * mat_a.stride(0) * a_element_size; + + // Detect stride pattern for A tensor to determine appropriate lda calculation + bool a_is_strided_tensor = (mat_a.stride(0) > mat_a.size(0)); + + if (a_is_strided_tensor) { + // For strided A tensors: stride(0) gives the actual leading dimension + lda = mat_a.stride(0); + } else { + // For non-strided A tensors: use the M dimension (total rows) + lda = mat_a.size(0); // Total M dimension for column-major layout + } + } + + // Select B batch for group i: B[i, :, :] + const void* group_b_ptr = b_ptr_base + i * mat_b.stride(0) * b_element_size; + int ldb; + + if (std::is_same::value) { + // Row-major GEMM: expecting B as [K, N] but we have [N, K], so transpose needed + ldb = mat_b.stride(2); // Leading dimension for accessing as [K, N] + } else { + // Detect stride pattern to determine appropriate ldb calculation + bool is_strided_tensor = (mat_b.stride(2) > mat_b.size(2)); + + if (is_strided_tensor) { + // For strided tensors: stride(2) gives the actual leading dimension + ldb = mat_b.stride(2); + } else { + // For non-strided tensors: use the N dimension + ldb = mat_b.size(1); + } + } + + // Output for this group: rows [start_m:end_m, :] in 2D output [total_m, N] + void* group_e_ptr = out_ptr_base + start_m * out.stride(0) * out_element_size; + int ldc = out.stride(0); // distance between rows in output (should be N for 2D case) + + gemm_descs.push_back({ + static_cast(m), + static_cast(N), + static_cast(K), + static_cast(lda), + static_cast(ldb), + static_cast(ldc), + {} // --> stride_Ds_ + }); + p_a_ptrs.push_back(group_a_ptr); + p_b_ptrs.push_back(group_b_ptr); + p_e_ptrs.push_back(group_e_ptr); + } + } else if (mat_a_dim == 3 && mat_b_dim == 3) { + // 3d*3d input, output is 3d - batched matrix multiplication + // A: [batch, m, k], B: [batch, k, n] or [batch, n, k] (depending on transpose), Output: [batch, m, n] + // Each batch is processed as a separate GEMM operation + const int batch_size = mat_a.size(0); + const int M = mat_a.size(1); // rows in each A matrix + const int K = mat_a.size(2); // columns in A == rows in B (or columns if B is transposed) + + // Determine N from B tensor - it could be B.size(1) or B.size(2) depending on layout + int N; + if (mat_b.size(1) == K) { + // B is [batch, k, n] - normal layout + N = mat_b.size(2); + } else if (mat_b.size(2) == K) { + // B is [batch, n, k] - transposed layout + N = mat_b.size(1); + } else { + 
TORCH_CHECK(false, "CK Group GEMM 3D-3D: B tensor dimensions incompatible with A. A=[", + batch_size, ",", M, ",", K, "], B=[", mat_b.size(0), ",", mat_b.size(1), ",", mat_b.size(2), "]"); + } + + for (int i = 0; i < batch_size; ++i) { + // Select A batch for group i: A[i, :, :] + const void* group_a_ptr = a_ptr_base + i * mat_a.stride(0) * a_element_size; + + // Select B batch for group i: B[i, :, :] + const void* group_b_ptr = b_ptr_base + i * mat_b.stride(0) * b_element_size; + + // Select output batch for group i: Output[i, :, :] + void* group_e_ptr = out_ptr_base + i * out.stride(0) * out_element_size; + + int lda, ldb, ldc; + + if (std::is_same::value) { + // Row-major A: leading dimension = distance between rows = stride(1) + lda = mat_a.stride(1); + } else { + // Column-major A: leading dimension = distance between columns = stride(2) + lda = mat_a.stride(2); + } + + if (std::is_same::value) { + // Row-major B: leading dimension = distance between rows + if (mat_b.size(1) == K) { + // B is [batch, k, n] - normal layout + ldb = mat_b.stride(1); // stride between K rows + } else { + // B is [batch, n, k] - transposed layout, treat as [k, n] for GEMM + ldb = mat_b.stride(2); // stride between N rows (since we're accessing as [k,n]) + } + } else { + // Column-major B: leading dimension = distance between columns + if (mat_b.size(1) == K) { + // B is [batch, k, n] - normal layout + ldb = mat_b.stride(2); // stride between N columns + } else { + // B is [batch, n, k] - transposed layout + ldb = mat_b.stride(1); // stride between K columns (since we're accessing as [n,k]→[k,n]) + } + } + + // Output is typically row-major: leading dimension = distance between rows = stride(1) + ldc = out.stride(1); + + gemm_descs.push_back({ + static_cast(M), + static_cast(N), + static_cast(K), + static_cast(lda), + static_cast(ldb), + static_cast(ldc), + {} // --> stride_Ds_ + }); + p_a_ptrs.push_back(group_a_ptr); + p_b_ptrs.push_back(group_b_ptr); + p_e_ptrs.push_back(group_e_ptr); + } + } else if (mat_a_dim == 3 && mat_b_dim == 2) { + // 3D*2D case requires offset tensor + auto offs_accessor = offs->accessor(); + int num_groups = offs_accessor.size(0); + // 3d*2d input, output is 3d. + // A: [n_groups, m, k], B: [k, total_n] (assuming row-major for both) + // Offset divides N dimension of B, each group gets different slice of B and different batch of A + const int batch_size = mat_a.size(0); // n_groups + const int M = mat_a.size(1); // rows in each A matrix + const int K = mat_a.size(2); // columns in A + + // For row-major A and B case: B should be [K, total_N] + const int total_N = mat_b.size(1); // B is [K, total_N] for row-major + + for (int i = 0; i < num_groups; ++i) { + int start_n = (i == 0) ? 
0 : offs_accessor[i - 1]; + int end_n = offs_accessor[i]; + int n = end_n - start_n; + + // Skip zero-sized groups but continue processing subsequent groups + if (n <= 0) { + continue; + } + + // Select A batch for group i: A[i, :, :] + const void* group_a_ptr = a_ptr_base + i * mat_a.stride(0) * a_element_size; + + // Select B slice for group i: B[:, start_n:end_n] (B[K, total_N]) + const void* group_b_ptr; + int ldb; + + // Check if B is row-major or column-major + if (std::is_same::value) { + // Row-major B [K, total_N]: slice columns [start_n:end_n] + group_b_ptr = b_ptr_base + start_n * mat_b.stride(1) * b_element_size; + ldb = mat_b.stride(0); // distance between rows (should be total_N) + } else { + // Column-major B [K, total_N]: slice columns [start_n:end_n] + group_b_ptr = b_ptr_base + start_n * mat_b.stride(1) * b_element_size; + ldb = mat_b.stride(1); // distance between columns (should be K) + } + + // Select output slice for group i: Output[:, start_n:end_n] + void* group_e_ptr = out_ptr_base + start_n * out.stride(1) * out_element_size; + + int lda, ldc; + + // Row-major A: leading dimension = distance between rows = stride(1) + lda = mat_a.stride(1); + // Output is row-major: leading dimension = distance between rows = stride(0) + ldc = out.stride(0); + + gemm_descs.push_back({ + static_cast(M), + static_cast(n), + static_cast(K), + static_cast(lda), + static_cast(ldb), + static_cast(ldc), + {} // --> stride_Ds_ + }); + p_a_ptrs.push_back(group_a_ptr); + p_b_ptrs.push_back(group_b_ptr); + p_e_ptrs.push_back(group_e_ptr); + } + } else { + TORCH_CHECK(false, "CK Group GEMM: Unsupported dimensions, mat A dim is ", mat_a_dim, ", mat B dim is ", mat_b_dim); + } + + TORCH_INTERNAL_ASSERT(p_a_ptrs.size() > 0, "CK Group GEMM: No valid groups"); + + // Initialize d_ptrs with the correct size + std::vector> d_ptrs(p_a_ptrs.size()); + + static DeviceOp gemm_instance; + auto argument = gemm_instance.MakeArgument( + p_a_ptrs, p_b_ptrs, d_ptrs, p_e_ptrs, + gemm_descs, PassThrough{}, PassThrough{}, PassThrough{} + ); + TORCH_INTERNAL_ASSERT(gemm_instance.IsSupportedArgument(argument), + "CK Group GEMM: argument unsupported (shape/strides/type config)"); + size_t arg_buf_size = gemm_instance.GetDeviceKernelArgSize(&argument); + size_t ws_size = gemm_instance.GetWorkSpaceSize(&argument); + + void* gemm_arg_buf = nullptr; + void* ws_buf = nullptr; + + hipMalloc(&gemm_arg_buf, arg_buf_size); + hipMalloc(&ws_buf, ws_size); + + gemm_instance.SetDeviceKernelArgs(&argument, gemm_arg_buf); + gemm_instance.SetWorkSpacePointer(&argument, ws_buf); + + auto invoker = gemm_instance.MakeInvoker(); + hipStream_t stream = c10::hip::getCurrentHIPStream(); + invoker.Run(argument, {stream}); + hipFree(gemm_arg_buf); + hipFree(ws_buf); +} + +void group_gemm_ck( + const at::Tensor& input_a, + const at::Tensor& input_b_colmajor, + const std::optional& offs, + const std::optional& /*bias*/, + at::Tensor& out) +{ + // Detect if input_a is row-major based on stride pattern + bool a_row_major = (input_a.dim() == 3) ? (input_a.stride(2) == 1) : (input_a.stride(1) == 1); + bool b_col_major = (input_b_colmajor.dim() == 3) ? 
(input_b_colmajor.stride(1) == 1) : (input_b_colmajor.stride(0) == 1); + // Ensure tensor A is row-major and contiguous if not already + at::Tensor mat_a = input_a; + if (!a_row_major) { + // If A is not row-major, make it contiguous (row-major) + mat_a = input_a.contiguous(); + } + // Force tensor B to be column-major using double transpose trick + // This guarantees stride(0) == 1 and stride(1) == K for [K, N] shape + at::Tensor mat_b = input_b_colmajor; + if (!b_col_major) { + mat_b = input_b_colmajor.transpose(-2, -1).contiguous().transpose(-2, -1); + } + + // For 3D tensors, check the last dimension stride for row-major detection + a_row_major = (mat_a.dim() == 3) ? (mat_a.stride(2) == 1) : (mat_a.stride(1) == 1); + bool b_row_major = (mat_b.dim() == 3) ? (mat_b.stride(2) == 1) : (mat_b.stride(1) == 1); + + if (mat_a.dtype() == at::kBFloat16) { + // bf16 path + if (a_row_major && b_row_major) { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } else if (a_row_major && !b_row_major) { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } else if (!a_row_major && b_row_major) { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } else { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } + } else if (mat_a.dtype() == at::kHalf) { + // fp16 path + if (a_row_major && b_row_major) { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } else if (a_row_major && !b_row_major) { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } else if (!a_row_major && b_row_major) { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } else { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } + } else if (mat_a.dtype() == at::kFloat) { + // fp32 path + if (a_row_major && b_row_major) { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } else if (a_row_major && !b_row_major) { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } else if (!a_row_major && b_row_major) { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } else { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } + } else { + TORCH_CHECK(false, "CK Group GEMM: Unsupported mat_a dtype"); + } + +} + +} // namespace detail +} // namespace hip +} // namespace at diff --git a/aten/src/ATen/native/mkldnn/xpu/Conv.cpp b/aten/src/ATen/native/mkldnn/xpu/Conv.cpp index 67558aeebbb83..1555eed558e29 100644 --- a/aten/src/ATen/native/mkldnn/xpu/Conv.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/Conv.cpp @@ -337,10 +337,6 @@ Tensor _convolution_out( TORCH_CHECK( 3 == ndim || 4 == ndim || 5 == ndim, "convolution only supports 3D, 4D, 5D tensor"); - // get computation format for Conv/TransposedConv - bool is_channels_last_suggested = - use_channels_last_for_conv(input_r, weight_r); - Tensor input = input_r, weight = weight_r; // PyTorch does not support ChannelsLast1D case, // thus we need the transformation here @@ -348,13 +344,8 @@ Tensor _convolution_out( input = view4d(input_r); weight = view4d(weight_r); } - // ensure the input/weight/bias/output are congituous in desired format - at::MemoryFormat mfmt = is_channels_last_suggested - ? get_cl_tag_by_ndim(input.ndimension()) - : at::MemoryFormat::Contiguous; - auto bias = bias_r.defined() ? 
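The "double transpose trick" used above is a standard way to force column-major strides without changing values; a quick check from Python:

```python
import torch

b = torch.randn(8, 16)                                      # row-major: strides (16, 1)
b_col = b.transpose(-2, -1).contiguous().transpose(-2, -1)  # same values
print(b_col.stride())                                       # (1, 8) -- column-major
torch.testing.assert_close(b, b_col)
```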
bias_r.contiguous() : bias_r; - input = input.contiguous(mfmt); - weight = weight.contiguous(mfmt); + // get computation format for Conv/TransposedConv + bool is_channels_last_suggested = use_channels_last_for_conv(input, weight); auto k = weight.ndimension(); if (k == input.ndimension() + 1) { @@ -388,6 +379,14 @@ Tensor _convolution_out( expand_param_if_needed(output_padding_, "output_padding", dim); params.groups = groups_; } + + // ensure the input/weight/bias/output are congituous in desired format + at::MemoryFormat mfmt = is_channels_last_suggested + ? get_cl_tag_by_ndim(input.ndimension()) + : at::MemoryFormat::Contiguous; + auto bias = bias_r.defined() ? bias_r.contiguous() : bias_r; + input = input.contiguous(mfmt); + weight = weight.contiguous(mfmt); check_shape_forward(input, weight, bias, params, true); Tensor output; @@ -514,18 +513,9 @@ Tensor convolution_overrideable( at::borrow_from_optional_tensor(bias_r_opt); const Tensor& bias_r = *bias_r_maybe_owned; - auto k = weight_r.ndimension(); - at::MemoryFormat backend_memory_format = at::MemoryFormat::Contiguous; - if (xpu_conv_use_channels_last(input_r, weight_r)) { - backend_memory_format = (k == 5) ? at::MemoryFormat::ChannelsLast3d - : at::MemoryFormat::ChannelsLast; - } - Tensor input_c = input_r.contiguous(backend_memory_format); - Tensor weight_c = weight_r.contiguous(backend_memory_format); - return _convolution( - input_c, - weight_c, + input_r, + weight_r, bias_r, stride_, padding_, diff --git a/aten/src/ATen/native/mkldnn/xpu/ScaledBlas.cpp b/aten/src/ATen/native/mkldnn/xpu/ScaledBlas.cpp new file mode 100644 index 0000000000000..de5b2d9b62a60 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/ScaledBlas.cpp @@ -0,0 +1,342 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +namespace at::native { + +using at::blas::ScalingType; +using at::blas::SwizzleType; + +namespace { +/* + * Scaling Type Determination: + * --------------------------- + * Conditions and corresponding Scaling Types: + * + * - If scale tensor is `Float8_e8m0fnu` or `Float8_e4m3fn`: + * - Returns BlockWise (with additional size checks). + * + * - Else if scale.numel() == 1: + * - Returns TensorWise. + * + * - Else if scale.dim() == 2 && scale.size(0) == outer_dim && scale.size(1) == + * 1: + * - Returns RowWise. + * + * - Otherwise: + * - Returns Error. + */ + +bool is_tensorwise_scaling(const at::Tensor& t, const at::Tensor& scale) { + return at::isFloat8Type(t.scalar_type()) && + scale.scalar_type() == at::kFloat && scale.numel() == 1; +} + +bool is_rowwise_scaling(const at::Tensor& t, const at::Tensor& scale) { + return ( + at::isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat && + scale.dim() == 2 && scale.size(0) == t.size(0) && scale.size(1) == 1 && + scale.is_contiguous()); +} + +bool is_desired_scaling( + const at::Tensor& t, + const at::Tensor& scale, + ScalingType desired_scaling) { + auto result = desired_scaling == ScalingType::TensorWise + ? 
is_tensorwise_scaling(t, scale) + : is_rowwise_scaling(t, scale); + return result; +} + +std::pair get_joint_scaling( + std::initializer_list> options, + const at::Tensor& a, + const at::Tensor& b, + const at::Tensor& scale_a, + const at::Tensor& scale_b) { + for (auto [lhs, rhs] : options) { + if (is_desired_scaling(a, scale_a, lhs) && + is_desired_scaling(b.t(), scale_b.t(), rhs)) { + return {lhs, rhs}; + } + } + TORCH_CHECK( + false, + "Invalid scaling configuration.\n" + "- For TensorWise scaling, a and b should be float8, scales should be float and singletons.\n" + "- For RowWise scaling, a and b should be float8, scales should be float, scale_a should be (", + a.size(0), + ", 1) and scale_b should be (1, ", + b.size(1), + "), and both should be contiguous.\n" + "Got a.dtype()=", + a.scalar_type(), + ", scale_a.dtype()=", + scale_a.scalar_type(), + ", scale_a.size()=", + scale_a.sizes(), + ", scale_a.stride()=", + scale_a.strides(), + ", ", + "b.dtype()=", + b.scalar_type(), + ", scale_b.dtype()=", + scale_b.scalar_type(), + ", scale_b.size()=", + scale_b.sizes(), + " and scale_b.stride()=", + scale_b.strides()); +} + +Tensor& _scaled_gemm( + const Tensor& mat1, + const Tensor& mat2, + const Tensor& scale_a, + const Tensor& scale_b, + const ScalingType scaling_choice_a, + const ScalingType scaling_choice_b, + const std::optional& bias, + const bool use_fast_accum, + Tensor& out, + const std::optional& alpha = std::nullopt) { + // TODO: scale_result and alpha is not defined or used! + std::optional scaled_result = std::nullopt; + at::native::onednn::scaled_matmul( + mat1, + mat2, + out, + scale_a, + scale_b, + scaling_choice_a, + scaling_choice_b, + bias, + scaled_result, + use_fast_accum); + + return out; +} + +} // namespace + +// Computes matrix multiply + bias while applying scaling to input and output +// matrices Scales are only applicable when matrices are of Float8 type and +// assumed to be equal to 1.0 by default. If output matrix type is 16 or 32-bit +// type, scale_result is not applied. Known limitations: +// - Only works if mat1 is row-major and mat2 is column-major +// - Only works if matrices sizes are divisible by 32 +// - If 1-dimensional tensors are used then scale_a should be size = +// mat1.size(0) +// and scale_b should have size = to mat2.size(1) +// Arguments: +// - `mat1`: the first operand of the matrix multiply, can be type +// `torch.float8_e4m3fn` or `torch.float8_e5m2` +// - `mat2`: the second operand of the matrix multiply, can be type +// `torch.float8_e4m3fn` or `torch.float8_e5m2` +// - `bias`: the bias, can be type `torch.float16` or `torch.bfloat16` +// - `out_dtype`: the output dtype, can either be a float8 or a higher +// precision floating point type +// - `scale_a`: a tensor with the inverse scale of `mat1`, whose +// shape/strides/dtype depend on the scaling scheme +// - `scale_b`: a tensor with the inverse scale of `mat2`, whose +// shape/strides/dtype depend on the scaling scheme +// - `scale_result`: a scalar tensor with the scale of the output, only +// utilized if the output is a float8 type +// - `use_fast_accum`: Not applicable for XPU. For now, it should always be +// false. +// - `out`: a reference to the output tensor + +Tensor& _scaled_mm_out_xpu( + const Tensor& mat1, + const Tensor& mat2, + const Tensor& scale_a, + const Tensor& scale_b, + const std::optional& bias, + const std::optional& scale_result, + std::optional out_dtype, + bool use_fast_accum, + Tensor& out) { + // Note: fast_accum is not supported in XPU for now. 
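For reference, the scale layouts this kernel accepts — described in the comment above and enforced by get_joint_scaling() earlier in this file — can be restated as a small standalone check. This is an illustrative sketch only; plausible_scaled_mm_scales is a hypothetical name and is not part of the patch.

#include <ATen/core/Tensor.h>

// Sketch (not part of the patch): the two scale layouts accepted for
// out[M, N] = mat1[M, K] * mat2[K, N]; both require float32 scale tensors.
//   TensorWise: scale_a and scale_b are single values.
//   RowWise:    scale_a is {M, 1}, scale_b is {1, N}
//               (validated as scale_b.t(), i.e. an {N, 1} contiguous view).
bool plausible_scaled_mm_scales(
    const at::Tensor& mat1,
    const at::Tensor& mat2,
    const at::Tensor& scale_a,
    const at::Tensor& scale_b) {
  const bool tensorwise = scale_a.numel() == 1 && scale_b.numel() == 1;
  const bool rowwise =
      scale_a.dim() == 2 && scale_a.size(0) == mat1.size(0) &&
      scale_a.size(1) == 1 && scale_b.dim() == 2 &&
      scale_b.size(0) == 1 && scale_b.size(1) == mat2.size(1);
  return tensorwise || rowwise;
}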
+ TORCH_CHECK(!use_fast_accum, "fast_accum is not supported in XPU for now."); + + TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix"); + TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix"); + + TORCH_CHECK( + mat1.sizes()[1] == mat2.sizes()[0], + "mat1 and mat2 shapes cannot be multiplied (", + mat1.sizes()[0], + "x", + mat1.sizes()[1], + " and ", + mat2.sizes()[0], + "x", + mat2.sizes()[1], + ")"); + + // Check what type of scaling we are doing based on inputs. This list is + // sorted by decreasing priority. + + // List of supported datatypes for XPU with oneDNN: + // https://uxlfoundation.github.io/oneDNN/dev_guide_matmul.html#data-types + auto [scaling_choice_a, scaling_choice_b] = get_joint_scaling( + { + std::make_pair(ScalingType::TensorWise, ScalingType::TensorWise), + std::make_pair(ScalingType::RowWise, ScalingType::RowWise), + }, + mat1, + mat2, + scale_a, + scale_b); + TORCH_CHECK( + !scale_result || + (scale_result->numel() == 1 && scale_result->scalar_type() == kFloat), + "scale_result must be a float scalar"); + TORCH_CHECK( + !bias || bias->numel() == mat2.sizes()[1], + "Bias must be size ", + mat2.sizes()[1], + " but got ", + bias->numel()); + TORCH_CHECK( + mat1.sizes()[1] % 16 == 0, + "Expected trailing dimension of mat1 to be divisible by 16 ", + "but got mat1 shape: (", + mat1.sizes()[0], + "x", + mat1.sizes()[1], + ")."); + TORCH_CHECK( + mat2.sizes()[0] % 16 == 0 && mat2.sizes()[1] % 16 == 0, + "mat2 shape (", + mat2.sizes()[0], + "x", + mat2.sizes()[1], + ") must be divisible by 16"); + // Check types + TORCH_CHECK( + !out_dtype || *out_dtype == out.scalar_type(), + "out_dtype must match output matrix type"); + TORCH_CHECK( + at::isFloat8Type(mat1.scalar_type()), + "Expected mat1 to be Float8 matrix got ", + mat1.scalar_type()); + TORCH_CHECK( + at::isFloat8Type(mat2.scalar_type()), + "Expected mat2 to be Float8 matrix got ", + mat2.scalar_type()); + // TODO: oneDNN Currently only supports e4m3 with group scales on BMG. Not + // support 2D scales, only 1D. Needs to add more checks there. + + if (bias) { + TORCH_CHECK( + bias->scalar_type() == kFloat || + bias->scalar_type() == c10::ScalarType::BFloat16 || + bias->scalar_type() == c10::ScalarType::Half, + "Bias must be Float32 or BFloat16 or Half, but got ", + bias->scalar_type()); + } + + { + auto bias_ = bias.value_or(Tensor()); + auto scale_result_ = scale_result.value_or(Tensor()); + + // NOLINTNEXTLINE(*c-array*) + TensorArg targs[]{ + {out, "out", 0}, + {mat1, "mat1", 1}, + {mat2, "mat2", 2}, + {bias_, "bias", 3}, + {scale_a, "scale_a", 4}, + {scale_b, "scale_b", 5}, + {scale_result_, "scale_result", 6}}; + checkAllSameGPU(__func__, targs); + } + + // Validation checks have passed lets resize the output to actual size + IntArrayRef mat1_sizes = mat1.sizes(); + IntArrayRef mat2_sizes = mat2.sizes(); + at::native::resize_output(out, {mat1_sizes[0], mat2_sizes[1]}); + + // If any of M, K, N is 0 - return early (the tensorwise/rowwise float8 gemm + // kernels do not support this case). + if (mat1_sizes[0] == 0 || mat1_sizes[1] == 0 || mat2_sizes[1] == 0) { + // `out` was created with `at::empty`. In the case where we are multiplying + // MxK by KxN and K is the zero dim, we need to initialize here to properly + // return a tensor of zeros. + if (mat1_sizes[1] == 0) { + out.zero_(); + } + + return out; + } + + // TODO: Scale_result is not supported by now!! 
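The K == 0 early return above works because a matmul whose inner dimension is zero is an all-zero matrix, and `out` was allocated with at::empty, so it must be cleared explicitly. A minimal, hypothetical illustration of that fact (CPU tensors, default dtype; not part of the patch):

#include <ATen/ATen.h>

// Illustration only: (M, 0) x (0, N) is mathematically an all-zero (M, N)
// matrix, which is why out.zero_() is required when K == 0 above.
void zero_inner_dim_example() {
  at::Tensor a = at::empty({4, 0});
  at::Tensor b = at::empty({0, 8});
  at::Tensor c = at::mm(a, b); // same values as at::zeros({4, 8})
  TORCH_INTERNAL_ASSERT(c.eq(0).all().item<bool>());
}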
+ return _scaled_gemm( + mat1, + mat2, + scale_a, + scale_b, + scaling_choice_a, + scaling_choice_b, + bias, + use_fast_accum, + out); +} + +Tensor _scaled_mm_xpu( + const Tensor& mat_a, + const Tensor& mat_b, + const Tensor& scale_a, + const Tensor& scale_b, + const std::optional& bias, + const std::optional& scale_result, + std::optional out_dtype, + bool use_fast_accum) { + const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type()); + Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_)); + return _scaled_mm_out_xpu( + mat_a, + mat_b, + scale_a, + scale_b, + bias, + scale_result, + out_dtype, + use_fast_accum, + out); +} + +} // namespace at::native diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/QConv.cpp b/aten/src/ATen/native/mkldnn/xpu/detail/QConv.cpp index 282f42f37a364..4d6cb1b81fac3 100644 --- a/aten/src/ATen/native/mkldnn/xpu/detail/QConv.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/detail/QConv.cpp @@ -133,7 +133,7 @@ at::Tensor quantized_convolution( // supported in conv. mask_weight = weight_zero_points.numel() > 1 ? 1 : 0; if (groups > 1 && weight_zero_points.numel() > 1) - mask_weight = (2 ^ 0) | (2 ^ 1); // 2^0 (group) | 2^1 (output channel) + mask_weight = (1 << 0) | (1 << 1); // 2^0 (group) | 2^1 (output channel) dnnl::primitive_attr pattr; bool src_need_zp = (act_zero_point != 0); diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/QMatmul.cpp b/aten/src/ATen/native/mkldnn/xpu/detail/QMatmul.cpp index ede01093ff3e7..f79dfadd65454 100644 --- a/aten/src/ATen/native/mkldnn/xpu/detail/QMatmul.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/detail/QMatmul.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -8,7 +9,6 @@ #include namespace at::native::onednn { - at::Tensor broadcast_bias2D( at::Tensor& dst, at::Tensor& bias, @@ -328,4 +328,236 @@ void quantized_matmul( result.copy_(dst); } +// Describes how to configure oneDNN scales for a given role/ScalingType +struct ScaleSpec { + // specifies the way scale values will be applied to an ARG tensor. + int mask; + // specifies how scales are grouped along dimensions where + // multiple scale factors are used. + dnnl::memory::dims groups; + // specifies data type for scale factors. + dnnl::memory::data_type dtype; + + // Helper to compute expected number of elements for scale tensors + // arg_type: "src" for SRC (groups pattern {1, X}), + // "wei" for WEIGHTS (groups pattern {X, 1}) + int64_t expected_numel( + int64_t outer_dim, + int64_t inner_dim, + const std::string& arg_type) const { + if (groups == dnnl::memory::dims{1, 1}) + return 1; // tensorwise scaling + + TORCH_CHECK( + arg_type == "src" || arg_type == "wei", + "Expected arg_type to be 'src' or 'wei', but got '", + arg_type, + "'"); + + // For rowwise: SRC groups={1, K}, WEI groups={K, 1} + TORCH_INTERNAL_ASSERT( + (groups == dnnl::memory::dims{1, inner_dim} || + groups == dnnl::memory::dims{inner_dim, 1}), + "The groups must be either {1, inner_dim} or {inner_dim, 1}. 
But got ", + groups, + "."); + return outer_dim; + } + + // Normalize an incoming scale tensor to contiguous storage and appropriate + // dtype/view + at::Tensor normalize(const at::Tensor& scale) const { + TORCH_INTERNAL_ASSERT( + dtype == dnnl::memory::data_type::f32, + "tensor scale currently must be f32, but got scale dtype: ", + scale.scalar_type()); + return scale.to(at::kFloat).contiguous(); + } +}; + +// This function defines how to set scales mask and groups according to: +// https://github.com/uxlfoundation/oneDNN/blob/main/tests/benchdnn/doc/knobs_attr.md#--attr-scales +// The returned value will be used in +// `set_scales(arg, mask, groups, data_type)`. +inline ScaleSpec make_scale_spec( + at::blas::ScalingType scaling_type, + int64_t M, + int64_t K, + int64_t N, + const std::string& arg_type) { + TORCH_CHECK( + arg_type == "src" || arg_type == "wei", + "Expected arg_type to be 'src' or 'wei', but got '", + arg_type, + "'"); + TORCH_INTERNAL_ASSERT( + (scaling_type == at::blas::ScalingType::TensorWise || + scaling_type == at::blas::ScalingType::RowWise), + "Currently only support scaling_type for TensorWise or RowWise"); + int64_t dim = K; // Currently only K is used for grouping + bool is_src = (arg_type == "src"); + if (scaling_type == at::blas::ScalingType::TensorWise) { + // Scale tensorwise. The same as `--attr-scales=common`. + // mask=0 : scale whole tensor + // groups={1, 1}: indicates that there is only one group for scaling + return {0, {1, 1}, dnnl::memory::data_type::f32}; + } else { + // (scaling_type == at::blas::ScalingType::RowWise) + // Scale RowWise. The same as `--attr-scales=per_dim_01`. + // mask={(1 << 0) | (1 << 1)}: Scale on both dim0 and dim1 + // SRC: groups={1, K}, WEIGHTS: groups={K, 1} + return { + (1 << 0) | (1 << 1), + is_src ? dnnl::memory::dims{1, dim} : dnnl::memory::dims{dim, 1}, + dnnl::memory::data_type::f32}; + } +} + +sycl::event scaled_matmul( + const Tensor& mat1, + const Tensor& mat2, + Tensor& result, + const Tensor& scale_a, + const Tensor& scale_b, + at::blas::ScalingType scaling_choice_a, + at::blas::ScalingType scaling_choice_b, + const std::optional& bias, + const std::optional& scale_result, + bool use_fast_accum) { + auto& engine = GpuEngineManager::Instance().get_engine(); + auto& stream = GpuStreamManager::Instance().get_stream(); + + // This function will do steps with following steps + // 1. create memory descriptor + // 2. call write_to_dnnl_memory() to actually write memory + // 3. execute + + const int64_t M = mat1.size(0); + const int64_t K = mat1.size(1); + const int64_t N = mat2.size(1); + + // 1.1 Create memory descriptor + dnnl::memory::desc src_md = get_onednn_md(mat1); + dnnl::memory::desc weights_md = get_onednn_md(mat2); + dnnl::memory::desc dst_md = get_onednn_md(result); + + // scale_a and scale_b has already be checked in `is_desired_scaling()` call. + // So we could directly get their memory desc and set later. 
+ dnnl::memory::desc scale_a_md = get_onednn_md(scale_a); + dnnl::memory::desc scale_b_md = get_onednn_md(scale_b); + + dnnl::memory::desc bias_md; + bool with_bias = bias.has_value(); + at::Tensor possible_reshaped_bias = bias.value_or(at::Tensor()); + if (with_bias) { + if (possible_reshaped_bias.dim() == 1) { + possible_reshaped_bias = + possible_reshaped_bias.reshape({1, possible_reshaped_bias.size(0)}); + bias_md = get_onednn_md(possible_reshaped_bias); + } else { + bias_md = get_onednn_md(possible_reshaped_bias); + } + } + + // 1.2 Create primitive descriptor and set scales mask + const ScaleSpec src_spec = make_scale_spec(scaling_choice_a, M, K, N, "src"); + const ScaleSpec wei_spec = make_scale_spec(scaling_choice_b, M, K, N, "wei"); + + dnnl::primitive_attr op_attr = dnnl::primitive_attr(); + +#if ONEDNN_SUPPORT_DETERMINISTIC + if (at::globalContext().deterministicAlgorithms() || + at::globalContext().deterministicMkldnn()) + op_attr.set_deterministic(true); +#endif + + std::vector default_groups; + op_attr.set_scales( + DNNL_ARG_SRC, src_spec.mask, src_spec.groups, src_spec.dtype); + op_attr.set_scales( + DNNL_ARG_WEIGHTS, wei_spec.mask, wei_spec.groups, wei_spec.dtype); + // scale_result tensor currently only supports scalar(TensorWise Scaling). + bool with_dst_scale = scale_result && scale_result->defined(); + if (with_dst_scale) { + op_attr.set_scales(DNNL_ARG_DST, 0, {1}, dnnl::memory::data_type::f32); + } + + op_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + + // 1.3 Create the matmul primitive descriptor + dnnl::matmul::primitive_desc matmul_pd = with_bias + ? dnnl::matmul::primitive_desc( + engine, src_md, weights_md, bias_md, dst_md, op_attr) + : dnnl::matmul::primitive_desc( + engine, src_md, weights_md, dst_md, op_attr); + + // 1.4 (Possible) Additional Checks + // TODO: In case there are memory desc does not align with the actual tensor, + // we might need to reorder weights similar to CPU's reorder_if_differ_in() + // call. For example, weights not the same as matmul_pd.weights_desc(), + + // 2. Prepare memory + + // Create memory + auto src_usr_m = make_onednn_memory(src_md, engine, mat1.data_ptr()); + auto weights_usr_m = make_onednn_memory(weights_md, engine, mat2.data_ptr()); + auto dst_usr_m = make_onednn_memory(dst_md, engine, result.data_ptr()); + dnnl::memory b_usr_m; + if (with_bias) { + b_usr_m = + make_onednn_memory(bias_md, engine, possible_reshaped_bias.data_ptr()); + } + + // Prepare runtime scale memories (flat 1-D views) using the specs + auto make_scale_mem_from_spec = [&](const ScaleSpec& spec, + int64_t expected_numel, + const at::Tensor& scale_tensor) { + at::Tensor prepared = spec.normalize(scale_tensor); + TORCH_CHECK( + prepared.numel() == expected_numel, + "Scale buffer length mismatch. Expected ", + expected_numel, + ", got ", + prepared.numel()); + dnnl::memory::desc scale_md( + {prepared.numel()}, spec.dtype, dnnl::memory::format_tag::x); + return make_onednn_memory(scale_md, engine, prepared.data_ptr()); + }; + + auto scratchpad = + make_onednn_memory(matmul_pd.scratchpad_desc(), engine, nullptr); + + // 3. 
Setup Args for exec + std::unordered_map args; + args.insert({DNNL_ARG_SRC, src_usr_m}); + args.insert({DNNL_ARG_WEIGHTS, weights_usr_m}); + args.insert({DNNL_ARG_DST, dst_usr_m}); + args.insert({DNNL_ARG_SCRATCHPAD, scratchpad}); + if (with_bias) { + args.insert({DNNL_ARG_BIAS, b_usr_m}); + } + + // Attach runtime scales using specs + auto src_sc_mem = make_scale_mem_from_spec( + src_spec, src_spec.expected_numel(M, K, "src"), scale_a); + auto wei_sc_mem = make_scale_mem_from_spec( + wei_spec, wei_spec.expected_numel(N, K, "wei"), scale_b); + args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, src_sc_mem}); + args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, wei_sc_mem}); + if (with_dst_scale) { + // Bind single f32 scalar as DST scale + at::Tensor dst_scale_f32 = scale_result->to(at::kFloat).contiguous(); + dnnl::memory::desc dst_sc_md( + {1}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::x); + auto dst_sc_mem = + make_onednn_memory(dst_sc_md, engine, dst_scale_f32.data_ptr()); + args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_sc_mem}); + } + + dnnl::matmul matmul_p = dnnl::matmul(matmul_pd); + sycl::event matmul_fwd_event = + dnnl::sycl_interop::execute(matmul_p, stream, args); + return matmul_fwd_event; +} + } // namespace at::native::onednn diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/Utils.cpp b/aten/src/ATen/native/mkldnn/xpu/detail/Utils.cpp index 15f24e9cbb3a4..a8a6b870ff6b6 100644 --- a/aten/src/ATen/native/mkldnn/xpu/detail/Utils.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/detail/Utils.cpp @@ -78,6 +78,10 @@ dnnl::memory::data_type get_onednn_dtype( return dnnl::memory::data_type::f32; case at::ScalarType::BFloat16: return dnnl::memory::data_type::bf16; + case at::ScalarType::Float8_e4m3fn: + return dnnl::memory::data_type::f8_e4m3; + case at::ScalarType::Float8_e5m2: + return dnnl::memory::data_type::f8_e5m2; default: if (!allow_undef) { TORCH_CHECK( diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h index 6b2bf01e6d73d..bbe880b672b9d 100644 --- a/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h +++ b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -202,4 +203,16 @@ void sdpa_backward( Tensor& grad_query, Tensor& grad_key, Tensor& grad_value); + +sycl::event scaled_matmul( + const Tensor& mat1, + const Tensor& mat2, + Tensor& result, + const Tensor& scale_a, + const Tensor& scale_b, + at::blas::ScalingType scaling_choice_a, + at::blas::ScalingType scaling_choice_b, + const std::optional& bias, + const std::optional& scale_result, + bool use_fast_accum); } // namespace at::native::onednn diff --git a/aten/src/ATen/native/mps/OperationUtils.h b/aten/src/ATen/native/mps/OperationUtils.h index 03b3076402d0a..21c56ab77a05f 100644 --- a/aten/src/ATen/native/mps/OperationUtils.h +++ b/aten/src/ATen/native/mps/OperationUtils.h @@ -40,8 +40,6 @@ using namespace at::mps; namespace at::native::mps { -void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)()); - struct MPSScalar { id getMTLBuffer() const { return __builtin_bit_cast(id, buffer.get()); diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm index 96cd5d41959c3..32510869cf5d1 100644 --- a/aten/src/ATen/native/mps/OperationUtils.mm +++ b/aten/src/ATen/native/mps/OperationUtils.mm @@ -53,21 +53,6 @@ - (MPSGraphTensor*)maximumWithNaNPropagationAndIntFallbackWithPrimaryTensor:(MPS @end namespace at::native::mps { - 
-void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)()) { - __block std::optional block_exception; - dispatch_sync(queue, ^() { - try { - block(); - } catch (...) { - block_exception = std::current_exception(); - } - }); - if (block_exception) { - std::rethrow_exception(*block_exception); - } -} - /** * Computes distance from lowest to highest element offset in given tensor. */ diff --git a/aten/src/ATen/native/mps/kernels/Indexing.metal b/aten/src/ATen/native/mps/kernels/Indexing.metal index b41e64d70ced5..ebe078d01781e 100644 --- a/aten/src/ATen/native/mps/kernels/Indexing.metal +++ b/aten/src/ATen/native/mps/kernels/Indexing.metal @@ -1,4 +1,5 @@ #include +#include #include #include @@ -31,10 +32,24 @@ OffsetT index_apply_indices( constant IndexAB* indices, constant int64_t* sizes, constant int64_t* strides, - uint num_indices) { + uint num_indices, + thread bool& error, + device ErrorMessages* error_buf) { OffsetT rc = offs.x; for (uint i = 0; i < num_indices; i++) { auto idx = indices[i].indexArray[offs.y]; + if (idx < -sizes[i] || idx >= sizes[i]) { + TORCH_REPORT_ERROR( + error_buf, + "index ", + idx, + " is out of bounds for dimension ", + i, + " with size ", + sizes[i]); + error = true; + break; + } if (idx < 0) { idx += sizes[i]; } @@ -55,6 +70,7 @@ kernel void index_select( constant int64_t* index_sizes, constant int64_t* index_strides, constant uint4& ndim_nindices_numel, + device ErrorMessages* error_buffer, uint thread_index [[thread_position_in_grid]]) { const auto ndim = ndim_nindices_numel.x; const auto num_indices = ndim_nindices_numel.y; @@ -65,8 +81,19 @@ kernel void index_select( indices_strides, ndim, thread_index); + bool error = false; auto input_offs = index_apply_indices( - offs.yz, indices, index_sizes, index_strides, num_indices); + offs.yz, + indices, + index_sizes, + index_strides, + num_indices, + error, + error_buffer); + if (error) { + output[offs.x / sizeof(T)] = 0; + return; + } output[offs.x / sizeof(T)] = input[input_offs / sizeof(T)]; } @@ -82,7 +109,9 @@ inline void index_put_impl( constant int64_t* index_sizes, constant int64_t* index_strides, constant uint4& ndim_nindices_numel, + device ErrorMessages* error_buffer, uint thread_index) { + bool error = false; const auto ndim = ndim_nindices_numel.x; const auto num_indices = ndim_nindices_numel.y; const auto offs = index_get_offsets( @@ -93,7 +122,16 @@ inline void index_put_impl( ndim, thread_index); auto output_offs = index_apply_indices( - offs.xz, indices, index_sizes, index_strides, num_indices); + offs.xz, + indices, + index_sizes, + index_strides, + num_indices, + error, + error_buffer); + if (error) { + return; + } output[output_offs / sizeof(T)] = input[offs.y / sizeof(T)]; } @@ -109,6 +147,7 @@ kernel void index_put( constant int64_t* index_sizes, constant int64_t* index_strides, constant uint4& ndim_nindices_numel, + device ErrorMessages* error_buffer, uint thread_index [[thread_position_in_grid]]) { index_put_impl( output, @@ -121,6 +160,7 @@ kernel void index_put( index_sizes, index_strides, ndim_nindices_numel, + error_buffer, thread_index); } @@ -136,6 +176,7 @@ kernel void index_put_serial( constant int64_t* index_sizes, constant int64_t* index_strides, constant uint4& ndim_nindices_numel, + device ErrorMessages* error_buffer, uint thread_index [[thread_position_in_grid]]) { (void)thread_index; // Suppress unused vairable varning for (uint idx = 0; idx < ndim_nindices_numel.z; ++idx) { @@ -150,6 +191,7 @@ kernel void index_put_serial( index_sizes, index_strides, 
ndim_nindices_numel, + error_buffer, idx); } } @@ -166,6 +208,7 @@ kernel void index_put_accumulate( constant int64_t* index_sizes, constant int64_t* index_strides, constant uint4& ndim_nindices_numel, + device ErrorMessages* error_buffer, uint thread_index [[thread_position_in_grid]]) { const auto ndim = ndim_nindices_numel.x; const auto num_indices = ndim_nindices_numel.y; @@ -176,8 +219,18 @@ kernel void index_put_accumulate( indices_strides, ndim, thread_index); + bool error = false; auto output_offs = index_apply_indices( - offs.xz, indices, index_sizes, index_strides, num_indices); + offs.xz, + indices, + index_sizes, + index_strides, + num_indices, + error, + error_buffer); + if (error) { + return; + } AtomicType::atomic_add( reinterpret_cast*>(output), output_offs / sizeof(T), @@ -197,6 +250,7 @@ kernel void index_put_accumulate( constant int64_t* index_sizes, \ constant int64_t* index_strides, \ constant uint4& ndim_nindices_numel, \ + device ErrorMessages* error_buffer, \ uint thread_index [[thread_position_in_grid]]) #define REGISTER_INDEX_OP_ALL_DTYPES(OP_NAME) \ diff --git a/aten/src/ATen/native/mps/kernels/LinearAlgebra.metal b/aten/src/ATen/native/mps/kernels/LinearAlgebra.metal index c356dbf9ecb38..496ba73f479bd 100644 --- a/aten/src/ATen/native/mps/kernels/LinearAlgebra.metal +++ b/aten/src/ATen/native/mps/kernels/LinearAlgebra.metal @@ -40,7 +40,7 @@ inline c10::metal::opmath_t matmul_inner( threadgroup_barrier(mem_flags::mem_threadgroup); for (uint k = 0; k < TILE_DIM; k++) { - sum += A_tile[tid.y][k] * B_tile[k][tid.x]; + sum += c10::metal::mul(A_tile[tid.y][k], B_tile[k][tid.x]); } threadgroup_barrier(mem_flags::mem_threadgroup); @@ -832,6 +832,10 @@ INSTANTIATE_MM_OPS(float); INSTANTIATE_MM_OPS(half); INSTANTIATE_MM_OPS(bfloat); +// Complex MM +INSTANTIATE_MM_OPS(float2); +INSTANTIATE_MM_OPS(half2); + // Integral MM INSTANTIATE_MM_OPS(long); INSTANTIATE_MM_OPS(int); diff --git a/aten/src/ATen/native/mps/operations/Blas.mm b/aten/src/ATen/native/mps/operations/Blas.mm index 16d744cedb8ef..5ebf5f604bfc1 100644 --- a/aten/src/ATen/native/mps/operations/Blas.mm +++ b/aten/src/ATen/native/mps/operations/Blas.mm @@ -141,6 +141,9 @@ Tensor dot_mps(const Tensor& self, const Tensor& other) { }; MPSStream* stream = at::mps::getCurrentMPSStream(); + if (result.numel() == 0) { + return result; + } Tensor matMulVec = at::mm(mat, vec.unsqueeze(1)).squeeze(1); @autoreleasepool { diff --git a/aten/src/ATen/native/mps/operations/EmbeddingBag.mm b/aten/src/ATen/native/mps/operations/EmbeddingBag.mm index e6690b2531f0d..d7916ccdf875d 100644 --- a/aten/src/ATen/native/mps/operations/EmbeddingBag.mm +++ b/aten/src/ATen/native/mps/operations/EmbeddingBag.mm @@ -220,7 +220,7 @@ Tensor _embedding_bag_dense_backward_mps(const Tensor& output_grad, auto num_threads = (params.mode == EmbeddingBagMode::MAX) ? 
output_grad.numel() : num_indices * params.feature_size; MPSStream* stream = getCurrentMPSStream(); - mps::dispatch_sync_with_rethrow(stream->queue(), ^() { + dispatch_sync_with_rethrow(stream->queue(), ^() { @autoreleasepool { id computeEncoder = stream->commandEncoder(); auto pipeline_state = lib.getPipelineStateForFunc(fmt::format("embedding_bag_backward_{}_{}", @@ -273,7 +273,7 @@ Tensor _embedding_bag_per_sample_weights_backward_mps(const Tensor& output_grad, auto num_threads = num_indices * feature_size; MPSStream* stream = getCurrentMPSStream(); - mps::dispatch_sync_with_rethrow(stream->queue(), ^() { + dispatch_sync_with_rethrow(stream->queue(), ^() { @autoreleasepool { id computeEncoder = stream->commandEncoder(); auto pipeline_state = lib.getPipelineStateForFunc(fmt::format("embedding_bag_per_sample_weights_backward_{}_{}", diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm index 0b0a84c45a52c..2a21f3f8aadca 100644 --- a/aten/src/ATen/native/mps/operations/Indexing.mm +++ b/aten/src/ATen/native/mps/operations/Indexing.mm @@ -179,7 +179,8 @@ static void dispatch_index_kernel(TensorIteratorBase& iter, iter.strides(2), index_size, index_stride, - ndim_nindiees); + ndim_nindiees, + mpsStream->getErrorBuffer()); mtl_dispatch1DJob(computeEncoder, indexSelectPSO, serial ? 1 : iter.numel()); }); } @@ -299,7 +300,7 @@ static Tensor nonzero_fallback(const Tensor& self) { MPSStream* stream = getCurrentMPSStream(); using CachedGraph = MPSUnaryCachedGraph; - dispatch_sync(stream->queue(), ^() { + dispatch_sync_with_rethrow(stream->queue(), ^() { stream->synchronize(SyncType::COMMIT_AND_WAIT); }); int64_t total_nonzero = at::count_nonzero(self).item(); @@ -384,7 +385,7 @@ static Tensor nonzero_fallback(const Tensor& self) { MPSStream* stream = getCurrentMPSStream(); using CachedGraph = MPSUnaryCachedGraph; - dispatch_sync(stream->queue(), ^() { + dispatch_sync_with_rethrow(stream->queue(), ^() { stream->synchronize(SyncType::COMMIT_AND_WAIT); }); int64_t total_nonzero = at::count_nonzero(self).item(); diff --git a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm index aed417ca9ca92..59cc1c6527e8e 100644 --- a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm +++ b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm @@ -190,10 +190,16 @@ bool use_metal_mm(const Tensor& self, const Tensor& other, const Tensor& output) { static bool always_use_metal = c10::utils::has_env("PYTORCH_MPS_PREFER_METAL"); constexpr auto max_stride_size = 32768; + constexpr auto max_complex_inner_size = 2048; static bool is_macos_14_4_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_4_PLUS); if (always_use_metal || c10::isIntegralType(self.scalar_type(), true)) { return true; } + // multiplicationWithPrimaryTensor: returns incorrect results if inner size exceeds 2048 + // See https://github.com/pytorch/pytorch/issues/167727#issuecomment-3529308548 + if (c10::isComplexType(self.scalar_type()) && self.size(1) > max_complex_inner_size) { + return true; + } return !is_macos_14_4_or_newer && (self.stride(0) > max_stride_size || self.stride(1) > max_stride_size || self.size(0) > max_stride_size || self.size(1) > max_stride_size || other.stride(0) > max_stride_size || other.stride(1) > max_stride_size || diff --git a/aten/src/ATen/native/mps/operations/LossOps.mm b/aten/src/ATen/native/mps/operations/LossOps.mm index c995b8fc237f3..f0bbcdabfa5cd 100644 --- 
a/aten/src/ATen/native/mps/operations/LossOps.mm +++ b/aten/src/ATen/native/mps/operations/LossOps.mm @@ -212,17 +212,12 @@ loss.resize_((reduction == Reduction::None || grad_output.defined()) ? target.sizes() : IntArrayRef({})); TORCH_CHECK(loss.is_mps()); - Tensor loss_squeezed = loss.squeeze(); - Tensor input_squeezed = input.squeeze(); - Tensor target_squeezed = target.squeeze(); - @autoreleasepool { - std::string key = - op_name + reductionToString(reduction) + getTensorsStringKey({input_squeezed, target_squeezed, weight}); + std::string key = op_name + reductionToString(reduction) + getTensorsStringKey({input, target, weight}); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { - newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_squeezed); - newCachedGraph->targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target_squeezed); + newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input); + newCachedGraph->targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target); MPSGraphTensor* bceLossUnweighted = nil; // if grad_output is defined, then it's a backward pass @@ -252,12 +247,12 @@ newCachedGraph->gradInputTensor = bceLoss; } } else { - newCachedGraph->lossTensor = reduceTensor(bceLoss, reduction, mpsGraph, input_squeezed.sizes().size()); + newCachedGraph->lossTensor = reduceTensor(bceLoss, reduction, mpsGraph, input.sizes().size()); } }); - Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input_squeezed); - Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor, target_squeezed); - Placeholder lossPlaceholder = Placeholder(cachedGraph->lossTensor, loss_squeezed); + Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input); + Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor, target); + Placeholder lossPlaceholder = Placeholder(cachedGraph->lossTensor, loss); NSMutableDictionary* feeds = [[NSMutableDictionary new] autorelease]; diff --git a/aten/src/ATen/native/mps/operations/Normalization.mm b/aten/src/ATen/native/mps/operations/Normalization.mm index 0c95fec667e80..7441692b6c291 100644 --- a/aten/src/ATen/native/mps/operations/Normalization.mm +++ b/aten/src/ATen/native/mps/operations/Normalization.mm @@ -923,7 +923,7 @@ Check if running mean exists (maybe do this check before making graph) MPSStream* stream = getCurrentMPSStream(); TORCH_CHECK_NOT_IMPLEMENTED(input.scalar_type() != kLong, "Not implemented for long on MPS"); @autoreleasepool { - mps::dispatch_sync_with_rethrow(stream->queue(), ^() { + dispatch_sync_with_rethrow(stream->queue(), ^() { // which kernel variant to use based on the normalized axis N size const int N_READS = 4; auto metalType = mps::scalarToMetalTypeString(input); diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 4424f51827d45..98873abe0c499 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -192,6 +192,11 @@ CompositeExplicitAutograd: _assert_tensor_metadata Meta: _assert_tensor_metadata_meta_symint +- func: _async_error(str msg) -> () + dispatch: + CompositeExplicitAutograd: _async_error + Meta: _async_error_meta + - func: _print(str s) -> () dispatch: CompositeExplicitAutograd: _print @@ -2803,7 +2808,7 @@ - func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA, MPS: floor_divide_out + CPU, CUDA, MPS, MTIA: floor_divide_out SparseCPU, SparseCUDA, SparseMPS: floor_divide_out_sparse_zerodim - func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor @@ -4292,6 +4297,7 @@ dispatch: SparseCPU: sparse_sparse_matmul_cpu SparseCUDA: sparse_sparse_matmul_cuda + SparseMPS: sparse_sparse_matmul_mps autogen: _sparse_sparse_matmul.out - func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices) @@ -4383,7 +4389,7 @@ variants: function, method dispatch: CompositeExplicitAutograd: mv - SparseCPU, SparseCUDA: mv_sparse + SparseCPU, SparseCUDA, SparseMPS: mv_sparse - func: mv.out(Tensor self, Tensor vec, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -7512,7 +7518,7 @@ - func: _sparse_mask_projection(Tensor self, Tensor mask, bool accumulate_matches=False) -> Tensor variants: method dispatch: - SparseCPU, SparseCUDA: sparse_mask_projection + SparseCPU, SparseCUDA, SparseMPS: sparse_mask_projection autogen: _sparse_mask_projection.out - func: _to_cpu(Tensor[] tensors) -> Tensor[] @@ -9832,7 +9838,7 @@ structured_delegate: erfinv.out variants: method, function dispatch: - SparseCPU, SparseCUDA: erfinv_sparse + SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr tags: pointwise @@ -9841,7 +9847,7 @@ structured_delegate: erfinv.out variants: method dispatch: - SparseCPU, SparseCUDA: erfinv_sparse_ + SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr_ tags: pointwise @@ -9851,7 +9857,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: erfinv_out - SparseCPU, SparseCUDA: erfinv_sparse_out + SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr_out tags: pointwise diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/buckbuild.bzl b/aten/src/ATen/native/quantized/cpu/qnnpack/buckbuild.bzl index 180442b4b09a4..fecce634ec08c 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/buckbuild.bzl +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/buckbuild.bzl @@ -1,7 +1,7 @@ load("//tools/build_defs:fb_xplat_cxx_library.bzl", "fb_xplat_cxx_library") load("//tools/build_defs:fb_xplat_cxx_test.bzl", "fb_xplat_cxx_test") load("//tools/build_defs:glob_defs.bzl", "subdir_glob") -load("//tools/build_defs:platform_defs.bzl", "ANDROID", "APPLE", "APPLETVOS", "CXX", "IOS", "MACOSX") +load("//tools/build_defs:platform_defs.bzl", "ANDROID", "APPLE", "CXX", "IOS", "MACOSX") # Shared by internal and OSS BUCK def define_qnnpack(third_party, labels = []): @@ -21,7 +21,7 @@ def define_qnnpack(third_party, labels = []): ("src", "requantization/*.h"), ]), header_namespace = "", - apple_sdks = (IOS, MACOSX, APPLETVOS), + apple_sdks = (IOS, MACOSX), compiler_flags = [ "-O2", "-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION", @@ -82,7 +82,7 @@ def define_qnnpack(third_party, labels = []): ("src", "requantization/*.h"), ]), header_namespace = "", - apple_sdks = (IOS, MACOSX, APPLETVOS), + apple_sdks = (IOS, MACOSX), compiler_flags = [ "-O3", "-ffast-math", @@ -129,7 +129,7 @@ def define_qnnpack(third_party, labels = []): ("src", "requantization/*.h"), ]), header_namespace = "", - apple_sdks = (IOS, MACOSX, APPLETVOS), + apple_sdks = (IOS, MACOSX), compiler_flags = [ "-O3", "-ffast-math", @@ -184,7 +184,7 @@ def define_qnnpack(third_party, labels = []): ("src", "requantization/*.h"), ]), header_namespace = "", - 
apple_sdks = (IOS, MACOSX, APPLETVOS), + apple_sdks = (IOS, MACOSX), compiler_flags = [ "-O3", "-ffast-math", @@ -236,7 +236,7 @@ def define_qnnpack(third_party, labels = []): ], ), header_namespace = "", - apple_sdks = (IOS, MACOSX, APPLETVOS), + apple_sdks = (IOS, MACOSX), compiler_flags = [ "-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION", ], @@ -291,7 +291,7 @@ def define_qnnpack(third_party, labels = []): ("src", "qnnpack/*.h"), ("include", "*.h"), ]), - apple_sdks = (IOS, MACOSX, APPLETVOS), + apple_sdks = (IOS, MACOSX), compiler_flags = [ "-O2", "-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION", @@ -398,7 +398,7 @@ def define_qnnpack(third_party, labels = []): ("src", "requantization/*.h"), ]), header_namespace = "", - apple_sdks = (IOS, MACOSX, APPLETVOS), + apple_sdks = (IOS, MACOSX), compiler_flags = [ "-O3", "-ffast-math", @@ -465,7 +465,7 @@ def define_qnnpack(third_party, labels = []): ("src", "requantization/*.h"), ]), header_namespace = "", - apple_sdks = (IOS, MACOSX, APPLETVOS), + apple_sdks = (IOS, MACOSX), compiler_flags = [ "-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION", "-Wno-unused-command-line-argument", @@ -525,7 +525,7 @@ def define_qnnpack(third_party, labels = []): ("src", "qnnpack/*.h"), ]), header_namespace = "", - apple_sdks = (IOS, MACOSX, APPLETVOS), + apple_sdks = (IOS, MACOSX), compiler_flags = [ "-O3", "-ffast-math", diff --git a/aten/src/ATen/native/sparse/cuda/SoftMax.cu b/aten/src/ATen/native/sparse/cuda/SoftMax.cu index d39e41c532553..7e3b502bf6f41 100644 --- a/aten/src/ATen/native/sparse/cuda/SoftMax.cu +++ b/aten/src/ATen/native/sparse/cuda/SoftMax.cu @@ -30,10 +30,12 @@ #include #include +#include +#include +#include #include #include #include -#include #include #include @@ -47,6 +49,7 @@ #include #include #include +#include #include #include #include diff --git a/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm b/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm index 5dbee4e38af7b..3da1cb5da53c8 100644 --- a/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm +++ b/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm @@ -10,6 +10,10 @@ #include #else #include +#include +#include +#include +#include #include #include #include @@ -441,6 +445,33 @@ Tensor addmm_sparse_dense_mps( return out; } +static std::tuple mps_intersect_binary_search( + const Tensor& A_keys, + const Tensor& B_keys, + int64_t lenA, + int64_t lenB, + bool boolean_flag) { + + auto stream = getCurrentMPSStream(); + auto outA_idx = at::empty({lenA}, A_keys.options().dtype(at::kLong)); + auto outB_idx = at::empty({lenA}, A_keys.options().dtype(at::kLong)); + auto counter = at::zeros({1}, A_keys.options().dtype(at::kInt)); + + dispatch_sync_with_rethrow(stream->queue(), ^() { + @autoreleasepool { + auto pso = lib.getPipelineStateForFunc("intersect_binary_search"); + auto enc = stream->commandEncoder(); + [enc setComputePipelineState:pso]; + mtl_setArgs(enc, A_keys, B_keys, outA_idx, outB_idx, counter, + static_cast(lenB), boolean_flag); + mtl_dispatch1DJob(enc, pso, static_cast(lenA)); + } + }); + + const auto match_count = static_cast(counter.item()); + return std::make_tuple(std::move(outA_idx), std::move(outB_idx), match_count); +} + SparseTensor& mul_out_sparse_mps(const Tensor& t_, const Tensor& src_, SparseTensor& r_) { TORCH_CHECK(r_.is_mps(), "mul: expected 'out' to be MPS, but got ", r_.device()); @@ -519,22 +550,10 @@ Tensor addmm_sparse_dense_mps( auto A_keys = A_is_lhs ? lhs_keys : rhs_keys; auto B_keys = A_is_lhs ? 
rhs_keys : lhs_keys; - auto outA_idx = at::empty({lenA}, at::device(device).dtype(kLong)); - auto outB_idx = at::empty({lenA}, at::device(device).dtype(kLong)); - auto counter = at::zeros({1}, at::device(device).dtype(kInt)); + auto [outA_idx, outB_idx, M_int64] = mps_intersect_binary_search( + A_keys, B_keys, lenA, lenB, A_is_lhs); - dispatch_sync_with_rethrow(stream->queue(), ^() { - @autoreleasepool { - auto pso = lib.getPipelineStateForFunc("intersect_binary_search"); - auto enc = stream->commandEncoder(); - [enc setComputePipelineState:pso]; - mtl_setArgs(enc, A_keys, B_keys, outA_idx, outB_idx, counter, - static_cast(lenB), A_is_lhs); - mtl_dispatch1DJob(enc, pso, static_cast(lenA)); - } - }); - - const uint32_t M = counter.item(); // number of structural matches + const auto M = static_cast(M_int64); // number of structural matches r_.resize_as_(lhs); @@ -758,6 +777,14 @@ Tensor addmm_sparse_dense_mps( using OptTensor = std::optional; +static Tensor create_sparse_output_values( + const Tensor& template_values, + int64_t output_nnz, + ScalarType dtype) { + auto out_val_sizes = template_values.sizes().vec(); + out_val_sizes[0] = output_nnz; + return at::zeros(out_val_sizes, template_values.options().dtype(dtype)); +} static void sparse_mask_apply_out_mps_kernel( Tensor& result, @@ -779,9 +806,9 @@ static void sparse_mask_apply_out_mps_kernel( auto src = src_in.coalesce(); auto mask = coalesce_mask ? mask_in.coalesce() : mask_in; - const int64_t src_nnz = src._nnz(); - const int64_t mask_nnz = mask._nnz(); - const int64_t sd = src.sparse_dim(); + const auto src_nnz = src._nnz(); + const auto mask_nnz = mask._nnz(); + const auto sd = src.sparse_dim(); result.sparse_resize_(mask.sizes(), mask.sparse_dim(), mask.dense_dim()); auto commonDtype = at::result_type(src, mask); @@ -810,53 +837,27 @@ static void sparse_mask_apply_out_mps_kernel( return; } + auto mask_indices = mask._indices().contiguous(); + auto src_values = src._values().to(commonDtype).contiguous(); + auto out_values = create_sparse_output_values(src_values, mask_nnz, commonDtype); + if (src_nnz == 0) { - auto out_indices = mask._indices().contiguous(); - auto src_values = src._values().to(commonDtype); - auto out_val_sizes = src_values.sizes().vec(); - out_val_sizes[0] = mask_nnz; - auto out_values = at::zeros(out_val_sizes, src_values.options()); - alias_into_sparse(result, out_indices, out_values); + alias_into_sparse(result, mask_indices, out_values); result._coalesced_(mask.is_coalesced()); return; } - auto mask_indices = mask._indices().contiguous(); - auto src_indices = src._indices().contiguous(); - auto src_values = src._values().to(commonDtype).contiguous(); - - auto mask_keys = flatten_indices(mask_indices, mask.sizes().slice(0, sd)).contiguous(); - auto src_keys = flatten_indices(src_indices, src.sizes().slice(0, sd)).contiguous(); + auto mask_keys = flatten_indices(mask._indices().contiguous(), mask.sizes().slice(0, sd)).contiguous(); + auto src_keys = flatten_indices(src._indices().contiguous(), src.sizes().slice(0, sd)).contiguous(); - const bool A_is_src = (src_nnz <= mask_nnz); - const int64_t lenA = A_is_src ? src_nnz : mask_nnz; - const int64_t lenB = A_is_src ? mask_nnz : src_nnz; + const auto A_is_src = (src_nnz <= mask_nnz); + const auto lenA = A_is_src ? src_nnz : mask_nnz; + const auto lenB = A_is_src ? mask_nnz : src_nnz; auto A_keys = A_is_src ? src_keys : mask_keys; auto B_keys = A_is_src ? 
mask_keys : src_keys; - const auto device = result.device(); - auto stream = getCurrentMPSStream(); - - auto outA_idx = at::empty({lenA}, at::device(device).dtype(at::kLong)); - auto outB_idx = at::empty({lenA}, at::device(device).dtype(at::kLong)); - auto counter = at::zeros({1}, at::device(device).dtype(at::kInt)); - - dispatch_sync_with_rethrow(stream->queue(), ^() { - @autoreleasepool { - auto pso = lib.getPipelineStateForFunc("intersect_binary_search"); - auto enc = stream->commandEncoder(); - [enc setComputePipelineState:pso]; - mtl_setArgs(enc, A_keys, B_keys, outA_idx, outB_idx, counter, - static_cast(lenB), A_is_src); - mtl_dispatch1DJob(enc, pso, static_cast(lenA)); - } - }); - - const int64_t M = static_cast(counter.item()); - - auto out_val_sizes = src_values.sizes().vec(); - out_val_sizes[0] = mask_nnz; - auto out_values = at::zeros(out_val_sizes, src_values.options()); + auto [outA_idx, outB_idx, M] = mps_intersect_binary_search( + A_keys, B_keys, lenA, lenB, A_is_src); if (M > 0) { auto src_match = outA_idx.narrow(0, 0, M); @@ -874,6 +875,70 @@ static void sparse_mask_apply_out_mps_kernel( result._coalesced_(mask.is_coalesced()); } +static void sparse_mask_projection_out_mps_kernel( + Tensor& result, + const Tensor& lhs, + const Tensor& rhs, + const OptTensor& /*x_hash_opt*/, + bool accumulate_matches) { + + TORCH_CHECK(lhs.is_sparse() && rhs.is_sparse(), "sparse_mask_projection: expected sparse COO"); + TORCH_CHECK(lhs.is_mps() && rhs.is_mps(), "sparse_mask_projection: expected MPS tensors"); + TORCH_CHECK(lhs.sparse_dim() == rhs.sparse_dim(), "sparse_dim mismatch"); + + auto lhs_c = lhs.coalesce(); + auto rhs_c = rhs.coalesce(); + + const auto sd = lhs_c.sparse_dim(); + const auto lhs_nnz = lhs_c._nnz(); + const auto rhs_nnz = rhs_c._nnz(); + + auto commonDtype = at::result_type(lhs_c, rhs_c); + TORCH_CHECK(canCast(commonDtype, result.scalar_type()), + "Can't convert ", commonDtype, " to output ", result.scalar_type()); + + result.sparse_resize_(lhs.sizes(), lhs.sparse_dim(), lhs.dense_dim()); + + auto lhs_indices = lhs_c._indices().contiguous(); + auto rhs_values = rhs_c._values().to(commonDtype).contiguous(); + auto out_values = create_sparse_output_values(rhs_values, lhs_nnz, commonDtype); + + if (lhs_nnz > 0 && rhs_nnz > 0) { + auto lhs_keys = flatten_indices(lhs_indices, lhs_c.sizes().slice(0, sd)).contiguous(); + auto rhs_keys = flatten_indices(rhs_c._indices().contiguous(), rhs_c.sizes().slice(0, sd)).contiguous(); + + const auto A_is_lhs = (lhs_nnz <= rhs_nnz); + const auto lenA = A_is_lhs ? lhs_nnz : rhs_nnz; + const auto lenB = A_is_lhs ? rhs_nnz : lhs_nnz; + auto A_keys = A_is_lhs ? lhs_keys : rhs_keys; + auto B_keys = A_is_lhs ? rhs_keys : lhs_keys; + + auto [outA_idx, outB_idx, M] = mps_intersect_binary_search( + A_keys, B_keys, lenA, lenB, A_is_lhs); + + if (M > 0) { + auto idx_in_A = outA_idx.narrow(0, 0, M); + auto idx_in_B = outB_idx.narrow(0, 0, M); + auto idx_in_lhs = A_is_lhs ? idx_in_A : idx_in_B; + auto idx_in_rhs = A_is_lhs ? 
idx_in_B : idx_in_A; + + const auto view_cols = rhs_values.numel() / std::max(rhs_nnz, 1); + auto rhs_rows = rhs_values.index_select(0, idx_in_rhs).contiguous(); + auto rhs_rows_2d = rhs_rows.view({M, view_cols}); + auto out_2d = out_values.view({lhs_nnz, view_cols}); + + if (accumulate_matches) { + out_2d.index_add_(0, idx_in_lhs, rhs_rows_2d); + } else { + out_2d.index_copy_(0, idx_in_lhs, rhs_rows_2d); + } + } + } + + alias_into_sparse(result, lhs._indices(), out_values); + result._coalesced_(lhs.is_coalesced()); +} + static void sparse_mask_intersection_out_mps_kernel( Tensor& result, const Tensor& lhs, @@ -888,5 +953,115 @@ static void sparse_mask_intersection_out_mps_kernel( /*coalesce_mask=*/false); } +Tensor sparse_sparse_matmul_mps(const Tensor& mat1_, const Tensor& mat2_) { + TORCH_CHECK(mat1_.is_sparse() && mat2_.is_sparse(), + "sparse_sparse_matmul_mps: both inputs must be sparse COO tensors"); + TORCH_CHECK(mat1_.is_mps() && mat2_.is_mps(), + "sparse_sparse_matmul_mps: both inputs must be on MPS device"); + TORCH_CHECK(mat1_.dim() == 2 && mat2_.dim() == 2, + "sparse_sparse_matmul_mps: both inputs must be 2D matrices"); + TORCH_CHECK(mat1_.dense_dim() == 0 && mat2_.dense_dim() == 0, + "sparse_sparse_matmul_mps: only scalar values supported (dense_dim == 0)"); + TORCH_CHECK(mat1_.size(1) == mat2_.size(0), + "mat1 and mat2 shapes cannot be multiplied (", mat1_.size(0), "x", mat1_.size(1), " and ", mat2_.size(0), "x", mat2_.size(1), ")"); + TORCH_CHECK(mat1_.scalar_type() == mat2_.scalar_type(), + "sparse_sparse_matmul_mps: mat1 dtype ", mat1_.scalar_type(), + " does not match mat2 dtype ", mat2_.scalar_type()); + + const auto device = mat1_.device(); + + auto A = mat1_.coalesce(); + auto B = mat2_.coalesce(); + + const auto I = A.size(0); + const auto K = A.size(1); + const auto N = B.size(1); + + const auto nnzA = A._nnz(); + const auto nnzB = B._nnz(); + + // Early empty result, return an empty, coalesced tensor + if (I == 0 || N == 0 || K == 0 || nnzA == 0 || nnzB == 0) { + auto empty_idx = at::empty({2, 0}, at::device(device).dtype(at::kLong)); + auto empty_val = at::empty({0}, at::device(device).dtype(mat1_.scalar_type())); + auto out = _sparse_coo_tensor_unsafe(empty_idx, empty_val, {I, N}, mat1_.options()); + out._coalesced_(true); + return out; + } + + const auto computeDtype = at::result_type(mat1_, mat2_); + + auto A_idx = A._indices().contiguous(); + auto A_val = A._values().to(computeDtype).contiguous(); + auto A_i = A_idx.select(0, 0).contiguous(); + auto A_k = A_idx.select(0, 1).contiguous(); + + auto B_idx = B._indices().contiguous(); + auto B_val = B._values().to(computeDtype).contiguous(); + auto B_k = B_idx.select(0, 0).contiguous(); + auto B_j = B_idx.select(0, 1).contiguous(); + + // csr-style row pointers for B by k (the shared dimension) + Tensor row_ptr_B; + { + auto batch_ptr = at::tensor({0LL, nnzB}, at::device(device).dtype(at::kLong)); + row_ptr_B = at::empty({K + 1}, at::device(device).dtype(at::kLong)); + build_row_ptr_per_batch_mps(B_k, batch_ptr, /*B=*/1, /*I=*/K, row_ptr_B); + } + + auto row_ptr_B_lo = row_ptr_B.narrow(0, 0, K); + auto row_ptr_B_hi = row_ptr_B.narrow(0, 1, K); + auto deg_B = row_ptr_B_hi.sub(row_ptr_B_lo); + + auto counts = deg_B.index_select(0, A_k); + + const int64_t P = counts.sum().item(); + if (P == 0) { + auto empty_idx = at::empty({2, 0}, at::device(device).dtype(at::kLong)); + auto empty_val = at::empty({0}, at::device(device).dtype(mat1_.scalar_type())); + auto out = _sparse_coo_tensor_unsafe(empty_idx, empty_val, {I, N}, 
mat1_.options()); + out._coalesced_(true); + return out; + } + + auto group_ids = repeat_interleave_mps(counts); + + // exclusive cumsum of counts + auto offsets = cumsum(counts, /*dim=*/0).sub(counts); + auto offsets_gather = offsets.index_select(0, group_ids); + auto within = at::arange(P, at::device(device).dtype(at::kLong)).sub(offsets_gather); + + // Map each output element to its source B row and position + auto k_per_out = A_k.index_select(0, group_ids); + auto start_in_B = row_ptr_B.index_select(0, k_per_out); + auto seg_index = start_in_B.add(within); + + // Assemble candidate coo pairs and values + auto i_out = A_i.index_select(0, group_ids).contiguous(); + auto j_out = B_j.index_select(0, seg_index).contiguous(); + auto vA_out = A_val.index_select(0, group_ids).contiguous(); + auto vB_out = B_val.index_select(0, seg_index).contiguous(); + auto v_out = vA_out.mul(vB_out); + + // build (2, P) indices + auto out_indices = at::empty({2, P}, at::device(device).dtype(at::kLong)).contiguous(); + out_indices.select(0, 0).copy_(i_out); + out_indices.select(0, 1).copy_(j_out); + + auto result = _sparse_coo_tensor_unsafe( + out_indices, v_out, {I, N}, mat1_.options().dtype(computeDtype)); + + result = result.coalesce(); + + if (result.scalar_type() != mat1_.scalar_type()) { + auto cast_vals = result._values().to(mat1_.scalar_type()); + auto out = _sparse_coo_tensor_unsafe(result._indices(), cast_vals, {I, N}, mat1_.options()); + out._coalesced_(true); + return out; + } + return result; +} + REGISTER_MPS_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_mps_kernel); +REGISTER_MPS_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_mps_kernel); } // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp index 7fce73151b00f..a6742a7cb9e78 100644 --- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp +++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp @@ -478,7 +478,7 @@ bool check_cudnn_tensor_shapes(sdp_params const& params, bool debug) { const auto s_k = params.key.sym_size(2); const auto d_qk = params.query.sym_size(3); const auto d_v = params.value.sym_size(3); - long cudnn_version = at::detail::getCUDAHooks().versionCuDNN(); + long cudnn_version = at::detail::getCUDAHooks().versionRuntimeCuDNN(); if (cudnn_version < 8903) { if (debug) { TORCH_WARN("SDPA fprop requires cudnn 8.9.3 or higher"); @@ -709,7 +709,7 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) { return false; #endif #if defined(CUDNN_VERSION) - static auto cudnn_version = cudnnGetVersion(); + static auto cudnn_version = at::detail::getCUDAHooks().versionRuntimeCuDNN(); if (params.dropout > 0.0 && cudnn_version > 91100 && cudnn_version < 91400) { if (debug) { TORCH_WARN(CUDNN_VERSION, " cuDNN version does not support droppout in SDPA (9.11 - 9.13)."); diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt index 81b3ce90b36bf..a522e7ab76cf4 100644 --- a/aten/src/ATen/test/CMakeLists.txt +++ b/aten/src/ATen/test/CMakeLists.txt @@ -61,6 +61,7 @@ list(APPEND ATen_CUDA_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cuda_complex_math_test.cu ${CMAKE_CURRENT_SOURCE_DIR}/cuda_complex_test.cu ${CMAKE_CURRENT_SOURCE_DIR}/cuda_cub_test.cu + ${CMAKE_CURRENT_SOURCE_DIR}/cuda_cublas_handle_pool_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cuda_device_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cuda_distributions_test.cu 
${CMAKE_CURRENT_SOURCE_DIR}/cuda_dlconvertor_test.cpp diff --git a/aten/src/ATen/test/cuda_cublas_handle_pool_test.cpp b/aten/src/ATen/test/cuda_cublas_handle_pool_test.cpp new file mode 100644 index 0000000000000..535bb3d1cc2ea --- /dev/null +++ b/aten/src/ATen/test/cuda_cublas_handle_pool_test.cpp @@ -0,0 +1,77 @@ +#include + +#include +#include +#include + +#include +#include +#include + +// Test concurrent access to getCurrentCUDABlasHandle and getCUDABlasLtWorkspace +// to verify that the data race fix is working correctly + +TEST(CUDABlasHandlePoolTest, ConcurrentGetAndClearWorkspaces) { + if (!at::cuda::is_available()) { + return; + } + + constexpr int num_accessor_threads = 15; + constexpr int num_clear_threads = 5; + constexpr int iterations_per_thread = 50; + + std::atomic stop{false}; + std::atomic error_count{0}; + std::vector threads; + threads.reserve(num_accessor_threads + num_clear_threads); + + // Launch accessor threads + for (int i = 0; i < num_accessor_threads; ++i) { + threads.emplace_back([&stop, &error_count]() { + try { + at::cuda::CUDAGuard device_guard(0); + + while (!stop.load(std::memory_order_relaxed)) { + const auto handle = at::cuda::getCurrentCUDABlasHandle(); + const auto workspace = at::cuda::getCUDABlasLtWorkspace(); + + if (handle == nullptr || workspace == nullptr) { + error_count++; + } + } + } catch (const std::exception& e) { + error_count++; + } + }); + } + + // Launch threads that clear workspaces + for (int i = 0; i < num_clear_threads; ++i) { + threads.emplace_back([&error_count]() { + try { + for (int j = 0; j < iterations_per_thread; ++j) { + at::cuda::clearCublasWorkspaces(); + std::this_thread::yield(); + } + } catch (const std::exception& e) { + error_count++; + } + }); + } + + // Let them run for a bit + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + stop.store(true, std::memory_order_relaxed); + + for (auto& thread : threads) { + thread.join(); + } + + EXPECT_EQ(error_count.load(), 0); +} + +int main(int argc, char* argv[]) { + ::testing::InitGoogleTest(&argc, argv); + c10::cuda::CUDACachingAllocator::init(1); + return RUN_ALL_TESTS(); +} diff --git a/aten/src/ATen/xpu/XPUEvent.h b/aten/src/ATen/xpu/XPUEvent.h index 19d42aae080f1..f33fd70ac0619 100644 --- a/aten/src/ATen/xpu/XPUEvent.h +++ b/aten/src/ATen/xpu/XPUEvent.h @@ -1,191 +1,3 @@ #pragma once #include - -#include - -namespace at::xpu { - -/* - * XPUEvent are movable not copyable wrappers around SYCL event. XPUEvent are - * constructed lazily when first recorded. It has a device, and this device is - * acquired from the first recording stream. Later streams that record the event - * must match the same device. - * - * Currently, XPUEvent does NOT support to export an inter-process event from - * another process via inter-process communication(IPC). So it means that - * inter-process communication for event handles between different processes is - * not available. This could impact some applications that rely on cross-process - * synchronization and communication. 
- */ -struct TORCH_XPU_API XPUEvent { - // Constructors - XPUEvent(bool enable_timing = false) noexcept - : enable_timing_{enable_timing} {} - - ~XPUEvent() { - if (isCreated()) { - const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); - if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_deletion( - at::kXPU, reinterpret_cast(event_.get())); - } - } - } - - XPUEvent(const XPUEvent&) = delete; - XPUEvent& operator=(const XPUEvent&) = delete; - - XPUEvent(XPUEvent&& other) = default; - XPUEvent& operator=(XPUEvent&& other) = default; - - operator sycl::event&() const { - return event(); - } - - std::optional device() const { - if (isCreated()) { - return at::Device(at::kXPU, device_index_); - } else { - return std::nullopt; - } - } - - inline bool isCreated() const { - return (event_.get() != nullptr); - } - - DeviceIndex device_index() const { - return device_index_; - } - - sycl::event& event() const { - return *event_; - } - - bool query() const { - using namespace sycl::info; - if (!isCreated()) { - return true; - } - - return event().get_info() == - event_command_status::complete; - } - - void record() { - record(getCurrentXPUStream()); - } - - void recordOnce(const XPUStream& stream) { - if (!isCreated()) { - record(stream); - } - } - - void record(const XPUStream& stream) { - if (!isCreated()) { - device_index_ = stream.device_index(); - assignEvent(stream.queue()); - const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); - if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_creation( - at::kXPU, reinterpret_cast(event_.get())); - } - } else { - TORCH_CHECK( - device_index_ == stream.device_index(), - "Event device ", - device_index_, - " does not match recording stream's device ", - stream.device_index(), - "."); - reassignEvent(stream.queue()); - } - const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); - if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_record( - at::kXPU, - reinterpret_cast(event_.get()), - reinterpret_cast(&stream.queue())); - } - } - - void block(const XPUStream& stream) { - if (isCreated()) { - std::vector event_list{event()}; - // Make this stream wait until event_ is completed. - stream.queue().ext_oneapi_submit_barrier(event_list); - const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); - if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_wait( - at::kXPU, - reinterpret_cast(event_.get()), - reinterpret_cast(&stream.queue())); - } - } - } - - double elapsed_time(const XPUEvent& other) const { - TORCH_CHECK( - isCreated() && other.isCreated(), - "Both events must be recorded before calculating elapsed time."); - TORCH_CHECK( - query() && other.query(), - "Both events must be completed before calculating elapsed time."); - TORCH_CHECK( - enable_timing_ && other.enable_timing_, - "Both events must be created with argument 'enable_timing=True'."); - -#if SYCL_COMPILER_VERSION < 20250000 - TORCH_CHECK_NOT_IMPLEMENTED( - false, - "elapsed_time of XPUEvent requires PyTorch to be built with SYCL compiler version 2025.0.0 or newer."); -#endif - - using namespace sycl::info::event_profiling; - // Block until both of the recorded events are completed. - uint64_t end_time_ns = other.event().get_profiling_info(); - uint64_t start_time_ns = event().get_profiling_info(); - // Return the eplased time in milliseconds. 
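The deleted `elapsed_time` above encodes the usual event-timing contract: both events must be created with timing enabled, recorded, and complete before the nanosecond difference is converted to milliseconds. The same contract is visible from Python; a small sketch using the CUDA event API (which the XPU wrapper mirrors) times a callable in milliseconds:

```python
import torch

def time_ms(fn):
    # Events must be created with enable_timing=True for elapsed_time to work.
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    fn()
    end.record()
    end.synchronize()  # both events must be complete before querying
    return start.elapsed_time(end)  # milliseconds, i.e. 1e-6 * (end_ns - start_ns)

if torch.cuda.is_available():
    x = torch.randn(4096, 4096, device="cuda")
    print(f"{time_ms(lambda: x @ x):.3f} ms")
```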
- return 1e-6 * - (static_cast(end_time_ns) - static_cast(start_time_ns)); - } - - void synchronize() const { - if (isCreated()) { - const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); - if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_synchronization( - at::kXPU, reinterpret_cast(event_.get())); - } - event().wait_and_throw(); - } - } - - private: - void assignEvent(sycl::queue& queue) { -#if SYCL_COMPILER_VERSION >= 20250000 - if (enable_timing_) { - event_ = std::make_unique( - sycl::ext::oneapi::experimental::submit_profiling_tag(queue)); - } else { - event_ = std::make_unique(queue.ext_oneapi_submit_barrier()); - } -#else - event_ = std::make_unique(queue.ext_oneapi_submit_barrier()); -#endif - } - - void reassignEvent(sycl::queue& queue) { - event_.reset(); - assignEvent(queue); - } - - bool enable_timing_ = false; - DeviceIndex device_index_ = -1; - // Only need to track the last event, as events in an in-order queue are - // executed sequentially. - std::unique_ptr event_; -}; - -} // namespace at::xpu +#include diff --git a/benchmarks/dynamo/check_accuracy.py b/benchmarks/dynamo/check_accuracy.py index 83cca8b36b993..7f8be84b93fd7 100644 --- a/benchmarks/dynamo/check_accuracy.py +++ b/benchmarks/dynamo/check_accuracy.py @@ -50,6 +50,7 @@ def check_accuracy(actual_csv, expected_csv, expected_filename): "mobilenet_v2", "pytorch_CycleGAN_and_pix2pix", "pytorch_stargan", + "repvgg_a2", "resnet152", "resnet18", "resnet50", diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_timm_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_timm_training.csv index b5e457e58997d..b2f40504a4991 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_timm_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_timm_training.csv @@ -10,7 +10,7 @@ beit_base_patch16_224,pass,7 -convnextv2_nano.fcmae_ft_in22k_in1k,pass,7 +convnextv2_nano.fcmae_ft_in22k_in1k,fail_accuracy,7 @@ -66,7 +66,7 @@ visformer_small,pass,7 -vit_base_patch14_dinov2.lvd142m,pass,7 +vit_base_patch14_dinov2.lvd142m,fail_accuracy,7 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_timm_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_timm_training.csv index b2071874b70d6..2d087e6595526 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_timm_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_timm_training.csv @@ -50,7 +50,7 @@ nfnet_l0,pass,7 -repvgg_a2,fail_accuracy,7 +repvgg_a2,pass,7 diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py index e0681f52586e7..a3bd58c4de747 100644 --- a/benchmarks/dynamo/common.py +++ b/benchmarks/dynamo/common.py @@ -952,7 +952,7 @@ def latency_experiment_summary(suite_name, args, model, timings, **kwargs): first_fields.append(kwargs["tag"]) headers = first_headers + ["speedup", "abs_latency"] row = first_fields + [float(speedup), median[1] * 1000] - msg = f"{speedup:.3f}x" + msg = f"{median[0] * 1000} ms, {median[1] * 1000} ms, {speedup:.3f}x" if args.baseline: headers.extend( [ @@ -1010,7 +1010,7 @@ def latency_experiment_summary(suite_name, args, model, timings, **kwargs): # Hypothetically you can use this from other places, but it's currently # inaccessible, and when this assert fails you need to update the # event_name here to account for the other cases you are using this - assert args.quantization is not None + assert any([args.quantization, args.optimus]) output_signpost( 
dict(zip(headers, row)), args, @@ -2288,11 +2288,9 @@ def record_status(accuracy_status, dynamo_start_stats): ) ): is_same = False - except Exception as e: + except Exception: # Sometimes torch.allclose may throw RuntimeError - exception_string = str(e) - accuracy_status = f"fail_exception: {exception_string}" - return record_status(accuracy_status, dynamo_start_stats=start_stats) + is_same = False if not is_same: accuracy_status = "eager_two_runs_differ" @@ -2409,11 +2407,9 @@ def record_status(accuracy_status, dynamo_start_stats): force_max_multiplier=force_max_multiplier, ): is_same = False - except Exception as e: + except Exception: # Sometimes torch.allclose may throw RuntimeError - exception_string = str(e) - accuracy_status = f"fail_exception: {exception_string}" - return record_status(accuracy_status, dynamo_start_stats=start_stats) + is_same = False if not is_same: if self.args.skip_accuracy_check: @@ -2587,6 +2583,9 @@ def warmup(fn, model, example_inputs, mode, niters=10): **experiment_kwargs, ) + # reset dynamo + torch._dynamo.reset() + if self.args.export_aot_inductor: optimized_model_iter_fn = optimize_ctx else: @@ -2950,7 +2949,7 @@ def run_one_model( status = self.check_tolerance(name, model, example_inputs, optimize_ctx) print(status) elif self.args.performance: - if self.args.backend == "torchao": + if self.args.backend in ["torchao", "optimus"]: status = self.run_performance_test_non_alternate( name, model, example_inputs, optimize_ctx, experiment, tag ) @@ -3526,6 +3525,12 @@ def get_example_inputs(self): action="store_true", help="Measure speedup with TorchInductor", ) + group.add_argument( + "--optimus", + choices=["vertical_opt", "horizontal_opt", "all"], + default=None, + help="Measure speedup of Optimus with TorchInductor baseline", + ) group.add_argument( "--quantization", choices=[ @@ -3783,6 +3788,9 @@ def run(runner, args, original_dir=None): if args.inductor: assert args.backend is None args.backend = "inductor" + if args.optimus: + assert args.backend is None + args.backend = "optimus" if args.quantization: assert args.backend is None args.backend = "torchao" @@ -4067,10 +4075,22 @@ def model_iter_fn_and_mark_step(*args, **kwargs): runner.model_iter_fn = model_iter_fn_and_mark_step optimize_ctx = torchao_optimize_ctx(args.quantization) + elif args.backend == "optimus": + from .optimus import get_baseline_ctx, get_optimus_optimize_ctx + + baseline_ctx = get_baseline_ctx( + nopython=args.nopython, inductor_compile_mode=args.inductor_compile_mode + ) + runner.model_iter_fn = baseline_ctx(runner.model_iter_fn) + optimize_ctx = get_optimus_optimize_ctx( + args.optimus, args.nopython, args.inductor_compile_mode + ) else: optimize_ctx = torch._dynamo.optimize(args.backend, nopython=args.nopython) experiment = ( - speedup_experiment if args.backend != "torchao" else latency_experiment + speedup_experiment + if args.backend not in ["torchao", "optimus"] + else latency_experiment ) if args.accuracy: output_filename = f"accuracy_{args.backend}.csv" @@ -4091,7 +4111,12 @@ def model_iter_fn_and_mark_step(*args, **kwargs): if args.only in runner.disable_cudagraph_models: args.disable_cudagraphs = True - if args.inductor or args.backend == "inductor" or args.export_aot_inductor: + if ( + args.inductor + or args.backend == "inductor" + or args.export_aot_inductor + or args.backend == "optimus" + ): inductor_config.triton.cudagraphs = not args.disable_cudagraphs inductor_config.triton.persistent_reductions = ( not args.disable_persistent_reductions diff --git 
a/benchmarks/dynamo/optimus.py b/benchmarks/dynamo/optimus.py new file mode 100644 index 0000000000000..f188b698edd5f --- /dev/null +++ b/benchmarks/dynamo/optimus.py @@ -0,0 +1,62 @@ +import functools + +import torch + + +def get_baseline_ctx(nopython, inductor_compile_mode): + return functools.partial( + torch.compile, + backend="inductor", + fullgraph=nopython, + mode=inductor_compile_mode, + ) + + +def get_optimus_optimize_ctx(config, nopython, inductor_compile_mode): + if config == "vertical_opt": + optimus_inductor_config = { + "pre_grad_fusion_options": { + "normalization_pass": {}, + "merge_splits_pass": {}, + "split_cat_pass": {}, + "unbind_stack_pass": {}, + "unbind_cat_to_view_pass": {}, + } + } + elif config == "horizontal_opt": + optimus_inductor_config = { + "pre_grad_fusion_options": { + "normalization_pass": {}, + "batch_linear": {}, + "batch_layernorm": {}, + }, + } + elif config == "all": + optimus_inductor_config = { + "pre_grad_fusion_options": { + "normalization_pass": {}, + "batch_linear": {}, + "batch_layernorm": {}, + "merge_splits_pass": {}, + "split_cat_pass": {}, + "unbind_stack_pass": {}, + "unbind_cat_to_view_pass": {}, + }, + } + else: + raise RuntimeError(f"Unknown optimus config: {config}") + + def _inner(fn): + if "pre_grad_fusion_options" in optimus_inductor_config: + torch._inductor.config.pre_grad_fusion_options = optimus_inductor_config[ + "pre_grad_fusion_options" + ] + if "post_grad_fusion_options" in optimus_inductor_config: + torch._inductor.config.post_grad_fusion_options = optimus_inductor_config[ + "post_grad_fusion_options" + ] + return torch.compile( + fn, backend="inductor", fullgraph=nopython, mode=inductor_compile_mode + ) + + return _inner diff --git a/benchmarks/dynamo/parse_logs.py b/benchmarks/dynamo/parse_logs.py index 8704fda9b997a..a3def611bbcc2 100644 --- a/benchmarks/dynamo/parse_logs.py +++ b/benchmarks/dynamo/parse_logs.py @@ -2,6 +2,7 @@ import os import re import sys +from pathlib import Path # This script takes the logs produced by the benchmark scripts (e.g., @@ -15,8 +16,7 @@ # This script is not very well written, feel free to rewrite it as necessary assert len(sys.argv) == 2 - -full_log = open(sys.argv[1]).read() +full_log = Path(sys.argv[1]).read_text() # If the log contains a gist URL, extract it so we can include it in the CSV gist_url = "" diff --git a/benchmarks/dynamo/pr_time_benchmarks/benchmarks/dtensor.py b/benchmarks/dynamo/pr_time_benchmarks/benchmarks/dtensor.py new file mode 100644 index 0000000000000..db59dfacb3f82 --- /dev/null +++ b/benchmarks/dynamo/pr_time_benchmarks/benchmarks/dtensor.py @@ -0,0 +1,62 @@ +import sys + +from benchmark_base import BenchmarkBase + +import torch +from torch.distributed._tensor import DTensor, Replicate +from torch.testing._internal.distributed.fake_pg import FakeStore + + +class BenchmarkDTensorDispatch(BenchmarkBase): + def __init__(self, operator, world_size) -> None: + super().__init__( + category=f"dtensor_dispatch_{operator}", + device="cuda", + ) + self.world_size = world_size + + def name(self) -> str: + prefix = f"{self.category()}" + return prefix + + def description(self) -> str: + return f"DTensor dispatch time for {self.category()}" + + def _prepare_once(self) -> None: + self.mesh = torch.distributed.device_mesh.init_device_mesh( + "cuda", (self.world_size,), mesh_dim_names=("dp",) + ) + self.a = DTensor.from_local( + torch.ones(10, 10, device=self.device()), self.mesh, [Replicate()] + ) + self.b = DTensor.from_local( + torch.ones(10, 10, device=self.device()), 
self.mesh, [Replicate()] + ) + + def _prepare(self) -> None: + pass + + +class BenchmarkDetach(BenchmarkDTensorDispatch): + def __init__(self, world_size) -> None: + super().__init__(operator="detach", world_size=world_size) + + def _work(self) -> None: + self.a.detach() + + +def main(): + world_size = 256 + fake_store = FakeStore() + torch.distributed.init_process_group( + "fake", store=fake_store, rank=0, world_size=world_size + ) + result_path = sys.argv[1] + BenchmarkDetach(world_size).enable_instruction_count().collect_all().append_results( + result_path + ) + torch.distributed.destroy_process_group() + + +if __name__ == "__main__": + main() diff --git a/benchmarks/operator_benchmark/aarch64_expected_ci_operator_benchmark_eager_float32_cpu.csv b/benchmarks/operator_benchmark/aarch64_expected_ci_operator_benchmark_eager_float32_cpu.csv index dc8b240ce570f..f3d8c7e65af04 100644 --- a/benchmarks/operator_benchmark/aarch64_expected_ci_operator_benchmark_eager_float32_cpu.csv +++ b/benchmarks/operator_benchmark/aarch64_expected_ci_operator_benchmark_eager_float32_cpu.csv @@ -484,24 +484,106 @@ PyTorch,sum,sum_R256_V512_dim0_contiguousTrue_cpu,short,False,50.954394,0.000000 PyTorch,sum,sum_R256_V512_dim0_contiguousFalse_cpu,short,False,57.957757,0.000000 PyTorch,sum,sum_R256_V512_dim1_contiguousTrue_cpu,short,False,53.592068,0.000000 PyTorch,sum,sum_R256_V512_dim1_contiguousFalse_cpu,short,False,51.339726,0.000000 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N16_cpu,short,False,7.040985,0.000000 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N64_cpu,short,False,7.168604,0.000000 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N128_cpu,short,False,7.434442,0.000000 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N16_cpu,short,False,7.078318,0.000000 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N64_cpu,short,False,7.426670,0.000000 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N128_cpu,short,False,7.679027,0.000000 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N16_cpu,short,False,7.281365,0.000000 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N64_cpu,short,False,7.682783,0.000000 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N128_cpu,short,False,8.381938,0.000000 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N16_cpu,short,False,7.039854,0.000000 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N64_cpu,short,False,7.399855,0.000000 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N128_cpu,short,False,7.715193,0.000000 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N16_cpu,short,False,7.255140,0.000000 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N64_cpu,short,False,7.753522,0.000000 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N128_cpu,short,False,8.364281,0.000000 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N16_cpu,short,False,7.476377,0.000000 
-PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N64_cpu,short,False,8.458564,0.000000 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N128_cpu,short,False,9.391939,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.bool,short,False,0.927,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.uint8,short,False,6.261,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int8,short,False,6.351,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int16,short,False,6.177,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int32,short,False,6.333,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int64,short,False,6.588,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float16,short,False,8.117,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.bfloat16,short,False,9.358,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float32,short,False,7.844,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float64,short,False,8.097,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.bool,short,False,6.159,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,0.926,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int8,short,False,6.192,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int16,short,False,6.276,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,6.461,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int64,short,False,6.524,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float16,short,False,8.136,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.bfloat16,short,False,6.854,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float32,short,False,6.446,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float64,short,False,6.829,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.bool,short,False,6.088,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.uint8,short,False,6.059,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int8,short,False,0.922,0.000000 
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int16,short,False,6.263,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int32,short,False,6.330,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int64,short,False,6.688,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float16,short,False,8.176,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.bfloat16,short,False,6.959,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float32,short,False,6.430,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float64,short,False,6.818,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.bool,short,False,6.350,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.uint8,short,False,6.221,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int8,short,False,6.193,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int16,short,False,0.922,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int32,short,False,6.263,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int64,short,False,6.525,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float16,short,False,7.960,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.bfloat16,short,False,6.801,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float32,short,False,6.594,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float64,short,False,7.089,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.bool,short,False,6.498,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,6.358,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int8,short,False,6.390,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int16,short,False,6.415,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,0.925,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int64,short,False,6.657,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float16,short,False,7.954,0.000000 
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.bfloat16,short,False,6.930,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float32,short,False,6.737,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float64,short,False,6.948,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.bool,short,False,6.757,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.uint8,short,False,6.402,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int8,short,False,6.550,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int16,short,False,6.518,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int32,short,False,6.766,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int64,short,False,0.929,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float16,short,False,8.557,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.bfloat16,short,False,9.045,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float32,short,False,7.672,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float64,short,False,7.276,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.bool,short,False,6.414,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.uint8,short,False,7.736,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int8,short,False,7.889,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int16,short,False,8.170,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int32,short,False,7.783,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int64,short,False,7.743,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float16,short,False,0.927,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.bfloat16,short,False,7.018,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float32,short,False,8.428,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float64,short,False,6.767,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.bool,short,False,6.479,0.000000 
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.uint8,short,False,7.827,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int8,short,False,6.450,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int16,short,False,6.320,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int32,short,False,6.385,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int64,short,False,8.119,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float16,short,False,8.063,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.bfloat16,short,False,0.925,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float32,short,False,8.629,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float64,short,False,6.638,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.bool,short,False,6.425,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.uint8,short,False,7.803,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int8,short,False,6.502,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int16,short,False,6.429,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int32,short,False,6.549,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int64,short,False,7.749,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float16,short,False,7.301,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.bfloat16,short,False,7.682,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float32,short,False,0.930,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float64,short,False,6.738,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.bool,short,False,6.798,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.uint8,short,False,6.506,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int8,short,False,6.494,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int16,short,False,6.668,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int32,short,False,6.696,0.000000 
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int64,short,False,7.115,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float16,short,False,7.910,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.bfloat16,short,False,7.410,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float32,short,False,6.868,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float64,short,False,0.924,0.000000 PyTorch,addcmul,addcmul_M1_N2_cpu_dtypetorch.float32,short,False,4.461410,0.000000 PyTorch,addcmul,addcmul_M1_N2_cpu_dtypetorch.bfloat16,short,False,4.560082,0.000000 PyTorch,addcmul,addcmul_M32_N64_cpu_dtypetorch.float32,short,False,5.141248,0.000000 diff --git a/benchmarks/operator_benchmark/pt/addmm_test.py b/benchmarks/operator_benchmark/pt/addmm_test.py index a98628944b3e8..3e94a9cd7f3dc 100644 --- a/benchmarks/operator_benchmark/pt/addmm_test.py +++ b/benchmarks/operator_benchmark/pt/addmm_test.py @@ -53,10 +53,8 @@ def forward(self, input_one, mat1, mat2): return torch.addmm(input_one, mat1, mat2) -op_bench.generate_pt_test(addmm_long_configs + addmm_long_configs, AddmmBenchmark) -op_bench.generate_pt_gradient_test( - addmm_long_configs + addmm_long_configs, AddmmBenchmark -) +op_bench.generate_pt_test(addmm_short_configs + addmm_long_configs, AddmmBenchmark) +op_bench.generate_pt_gradient_test(addmm_long_configs, AddmmBenchmark) """Mircobenchmark for addbmm operator.""" @@ -107,9 +105,7 @@ def forward(self, input_one, batch1, batch2): ) op_bench.generate_pt_test(addbmm_long_configs + addbmm_short_configs, AddbmmBenchmark) -op_bench.generate_pt_gradient_test( - addbmm_long_configs + addbmm_short_configs, AddbmmBenchmark -) +op_bench.generate_pt_gradient_test(addbmm_long_configs, AddbmmBenchmark) if __name__ == "__main__": op_bench.benchmark_runner.main() diff --git a/benchmarks/operator_benchmark/pt/tensor_to_test.py b/benchmarks/operator_benchmark/pt/tensor_to_test.py index 621e58212cba2..9354c8c52eaa8 100644 --- a/benchmarks/operator_benchmark/pt/tensor_to_test.py +++ b/benchmarks/operator_benchmark/pt/tensor_to_test.py @@ -4,74 +4,84 @@ tensor_conversion_short_configs = op_bench.cross_product_configs( - M=( - 8, - 16, - 32, - ), - N=( - 16, - 64, - 128, - ), + M=[32], + N=[128], device=["cpu", "cuda"], + dtype_one=[ + torch.bool, + torch.uint8, + torch.int8, + torch.int16, + torch.int32, + torch.int64, + torch.half, + torch.bfloat16, + torch.float, + torch.double, + ], + dtype_two=[ + torch.bool, + torch.uint8, + torch.int8, + torch.int16, + torch.int32, + torch.int64, + torch.half, + torch.bfloat16, + torch.float, + torch.double, + ], tags=["short"], ) tensor_conversion_long_configs = op_bench.cross_product_configs( - M=( - 64, - 128, - 256, - 512, - ), - N=( - 256, - 512, - 1024, - 2048, - ), + M=[1024], + N=[1024], device=["cpu", "cuda"], + dtype_one=[ + torch.bool, + torch.uint8, + torch.int8, + torch.int16, + torch.int32, + torch.int64, + torch.half, + torch.bfloat16, + torch.float, + torch.double, + ], + dtype_two=[ + torch.bool, + torch.uint8, + torch.int8, + torch.int16, + torch.int32, + torch.int64, + torch.half, + torch.bfloat16, + torch.float, + torch.double, + ], tags=["long"], ) -class 
FloatToHalfTensorConversionBenchmark(op_bench.TorchBenchmarkBase): - def init(self, M, N, device): +class TensorConversionBenchmark(op_bench.TorchBenchmarkBase): + def init(self, M, N, dtype_one, dtype_two, device): self.inputs = { "input": torch.rand( M, N, device=device, requires_grad=False, dtype=torch.float - ) + ).to(dtype=dtype_one) } + self.dtype_one = dtype_one + self.dtype_two = dtype_two def forward(self, input): - return input.to(torch.half) + return input.to(dtype=self.dtype_two) -class HalfToFloatTensorConversionBenchmark(op_bench.TorchBenchmarkBase): - def init(self, M, N, device): - self.inputs = { - "input": torch.rand( - M, N, device=device, requires_grad=False, dtype=torch.half - ) - } - - def forward(self, input): - return input.to(torch.float) - - -op_bench.generate_pt_test( - tensor_conversion_short_configs, FloatToHalfTensorConversionBenchmark -) -op_bench.generate_pt_test( - tensor_conversion_long_configs, FloatToHalfTensorConversionBenchmark -) -op_bench.generate_pt_test( - tensor_conversion_short_configs, HalfToFloatTensorConversionBenchmark -) -op_bench.generate_pt_test( - tensor_conversion_long_configs, HalfToFloatTensorConversionBenchmark -) +op_bench.generate_pt_test(tensor_conversion_short_configs, TensorConversionBenchmark) +op_bench.generate_pt_test(tensor_conversion_long_configs, TensorConversionBenchmark) if __name__ == "__main__": op_bench.benchmark_runner.main() diff --git a/benchmarks/operator_benchmark/x86_64_expected_ci_operator_benchmark_eager_float32_cpu.csv b/benchmarks/operator_benchmark/x86_64_expected_ci_operator_benchmark_eager_float32_cpu.csv index d7a8e65aa85af..71a5930a01a3f 100644 --- a/benchmarks/operator_benchmark/x86_64_expected_ci_operator_benchmark_eager_float32_cpu.csv +++ b/benchmarks/operator_benchmark/x86_64_expected_ci_operator_benchmark_eager_float32_cpu.csv @@ -349,24 +349,106 @@ PyTorch,sum,sum_R256_V512_dim0_contiguousTrue_cpu,short,FALSE,12.5841 PyTorch,sum,sum_R256_V512_dim0_contiguousFALSE_cpu,short,FALSE,20.8765 PyTorch,sum,sum_R256_V512_dim1_contiguousTrue_cpu,short,FALSE,15.4414 PyTorch,sum,sum_R256_V512_dim1_contiguousFALSE_cpu,short,FALSE,15.3287 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N16_cpu,short,FALSE,5.0499 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N64_cpu,short,FALSE,5.3229 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N128_cpu,short,FALSE,5.4418 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N16_cpu,short,FALSE,5.0868 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N64_cpu,short,FALSE,5.4495 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N128_cpu,short,FALSE,5.5578 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N16_cpu,short,FALSE,5.2631 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N64_cpu,short,FALSE,5.5646 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N128_cpu,short,FALSE,5.7898 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N16_cpu,short,FALSE,5.0228 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N64_cpu,short,FALSE,5.3692 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N128_cpu,short,FALSE,5.4006 
-PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N16_cpu,short,FALSE,5.1107 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N64_cpu,short,FALSE,5.4119 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N128_cpu,short,FALSE,5.5583 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N16_cpu,short,FALSE,5.3818 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N64_cpu,short,FALSE,5.5742 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N128_cpu,short,FALSE,6.8414 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.bool,short,False,0.797 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.uint8,short,False,6.071 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int8,short,False,6.031 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int16,short,False,6.243 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int32,short,False,7.231 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int64,short,False,7.791 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float16,short,False,12.661 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.bfloat16,short,False,11.225 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float32,short,False,9.772 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float64,short,False,9.872 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.bool,short,False,6.033 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,0.781 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int8,short,False,6.060 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int16,short,False,6.180 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,7.258 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int64,short,False,7.758 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float16,short,False,10.504 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.bfloat16,short,False,6.749 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float32,short,False,7.679 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float64,short,False,7.797 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.bool,short,False,6.019 
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.uint8,short,False,6.079 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int8,short,False,0.785 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int16,short,False,6.188 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int32,short,False,7.288 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int64,short,False,7.770 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float16,short,False,10.466 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.bfloat16,short,False,6.676 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float32,short,False,7.736 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float64,short,False,7.780 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.bool,short,False,6.130 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.uint8,short,False,6.221 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int8,short,False,6.101 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int16,short,False,0.791 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int32,short,False,6.254 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int64,short,False,7.733 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float16,short,False,10.562 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.bfloat16,short,False,6.704 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float32,short,False,7.819 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float64,short,False,8.276 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.bool,short,False,6.361 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,6.364 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int8,short,False,6.309 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int16,short,False,6.362 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,0.791 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int64,short,False,7.746 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float16,short,False,9.462 
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.bfloat16,short,False,6.678 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float32,short,False,7.827 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float64,short,False,8.200 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.bool,short,False,6.925 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.uint8,short,False,6.947 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int8,short,False,6.962 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int16,short,False,6.906 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int32,short,False,7.664 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int64,short,False,0.782 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float16,short,False,10.528 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.bfloat16,short,False,10.123 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float32,short,False,9.234 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float64,short,False,8.694 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.bool,short,False,12.653 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.uint8,short,False,9.348 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int8,short,False,8.774 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int16,short,False,9.063 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int32,short,False,10.012 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int64,short,False,13.641 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float16,short,False,0.788 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.bfloat16,short,False,13.757 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float32,short,False,7.170 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float64,short,False,12.511 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.bool,short,False,6.516 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.uint8,short,False,8.539 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int8,short,False,6.483 
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int16,short,False,6.468 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int32,short,False,7.752 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int64,short,False,9.868 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float16,short,False,10.556 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.bfloat16,short,False,0.792 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float32,short,False,7.577 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float64,short,False,8.267 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.bool,short,False,6.819 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.uint8,short,False,7.715 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int8,short,False,6.754 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int16,short,False,6.825 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int32,short,False,7.790 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int64,short,False,9.219 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float16,short,False,5.977 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.bfloat16,short,False,7.069 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float32,short,False,0.794 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float64,short,False,8.301 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.bool,short,False,7.401 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.uint8,short,False,7.843 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int8,short,False,7.117 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int16,short,False,7.170 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int32,short,False,8.000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int64,short,False,9.284 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float16,short,False,7.179 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.bfloat16,short,False,7.645 
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float32,short,False,7.988 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float64,short,False,0.792 PyTorch,relu,"relu_dims(3,4,5)_contigFALSE_inplaceFALSE_dtypetorch.quint8",short,FALSE,9.4657 PyTorch,relu,"relu_dims(3,4,5)_contigFALSE_inplaceFALSE_dtypetorch.qint8",short,FALSE,9.4625 PyTorch,relu,"relu_dims(3,4,5)_contigFALSE_inplaceFALSE_dtypetorch.qint32",short,FALSE,9.4165 diff --git a/benchmarks/sparse/spmm.py b/benchmarks/sparse/spmm.py index b707556dd7a15..b2c658d6faeb6 100644 --- a/benchmarks/sparse/spmm.py +++ b/benchmarks/sparse/spmm.py @@ -52,19 +52,18 @@ def test_sparse_coo_and_csr(m, n, k, nnz, test_count): start.record() coo.matmul(mat) stop.record() - times.append(start.elapsed_time(stop)) - coo_mean_time = sum(times) / len(times) + coo_mean_time = sum(times) / len(times) - times = [] - for _ in range(test_count): - start.record() - csr.matmul(mat) - stop.record() - times.append(start.elapsed_time(stop)) + times = [] + for _ in range(test_count): + start.record() + csr.matmul(mat) + stop.record() + times.append(start.elapsed_time(stop)) - csr_mean_time = sum(times) / len(times) + csr_mean_time = sum(times) / len(times) return coo_mean_time, csr_mean_time @@ -84,10 +83,13 @@ def test_sparse_coo_and_csr(m, n, k, nnz, test_count): if args.outfile == "stdout": outfile = sys.stdout + need_close = False elif args.outfile == "stderr": outfile = sys.stderr + need_close = False else: outfile = open(args.outfile, "a") + need_close = True test_count = args.test_count m = args.m @@ -148,3 +150,5 @@ def test_sparse_coo_and_csr(m, n, k, nnz, test_count): time, file=outfile, ) + if need_close: + outfile.close() diff --git a/benchmarks/sparse/spmv.py b/benchmarks/sparse/spmv.py index f8900882ca4ec..3e9502686a884 100644 --- a/benchmarks/sparse/spmv.py +++ b/benchmarks/sparse/spmv.py @@ -82,10 +82,13 @@ def test_sparse_coo_and_csr(m, nnz, test_count): if args.outfile == "stdout": outfile = sys.stdout + need_close = False elif args.outfile == "stderr": outfile = sys.stderr + need_close = False else: outfile = open(args.outfile, "a") + need_close = True test_count = args.test_count m = args.m @@ -132,3 +135,5 @@ def test_sparse_coo_and_csr(m, nnz, test_count): time_csr, file=outfile, ) + if need_close: + outfile.close() diff --git a/benchmarks/sparse/triton_ops.py b/benchmarks/sparse/triton_ops.py index 48a88d592ea2c..a49a53bcd207c 100644 --- a/benchmarks/sparse/triton_ops.py +++ b/benchmarks/sparse/triton_ops.py @@ -179,10 +179,13 @@ def integer_or_float_list(a): if args.outfile == "stdout": outfile = sys.stdout + need_close = False elif args.outfile == "stderr": outfile = sys.stderr + need_close = False else: outfile = open(args.outfile, "a") + need_close = True ops = args.ops.split(",") @@ -434,3 +437,5 @@ def show_best_messages(best_messages=best_messages): if op not in {"bsr_scatter_mm6", "bsr_dense_mm_with_meta"}: # Break on operations that do not consume parameters break + if need_close: + outfile.close() diff --git a/benchmarks/transformer/score_mod.py b/benchmarks/transformer/score_mod.py index 928cbf27df5b1..e9af132df28a9 100644 --- a/benchmarks/transformer/score_mod.py +++ b/benchmarks/transformer/score_mod.py @@ -125,6 +125,17 @@ def wrapper(config, *args, **kwargs): ] DtypeString = Literal["bfloat16", "float16", "float32"] SpeedupType = Literal["fwd", "bwd"] +# Operator Name mapping +backend_to_operator_name 
= { + "math": "math attention kernel", + "efficient": "efficient attention kernel", + "cudnn": "cudnn attention kernel", + "fav2": "flash attention 2 kernel", + "fav3": "flash attention 3 kernel", + "fakv": "flash attention kv cache kernel", + "og-eager": "eager attention kernel", + "flex": "flex attention kernel", +} def benchmark_torch_function_in_microseconds(func: Callable, *args, **kwargs) -> float: @@ -1265,12 +1276,14 @@ class BenchmarkRecord: model: ModelInfo metric: MetricInfo + operator_name = backend_to_operator_name.get(backend, backend) + # Benchmark extra info benchmark_extra_info = { "input_config": input_config, "device": device, "arch": device_arch, - "operator_name": backend, + "operator_name": operator_name, "attn_type": config.attn_type, "shape": str(config.shape), "max_autotune": config.max_autotune, @@ -1288,7 +1301,7 @@ class BenchmarkRecord: type="attention-benchmark", origins=["pytorch"], extra_info={ - "operator_name": backend, + "operator_name": operator_name, "attn_type": config.attn_type, }, ), @@ -1315,7 +1328,7 @@ class BenchmarkRecord: type="attention-benchmark", origins=["pytorch"], extra_info={ - "operator_name": backend, + "operator_name": operator_name, }, ), metric=MetricInfo( @@ -1341,7 +1354,7 @@ class BenchmarkRecord: type="attention-benchmark", origins=["pytorch"], extra_info={ - "operator_name": backend, + "operator_name": operator_name, }, ), metric=MetricInfo( @@ -1371,7 +1384,7 @@ class BenchmarkRecord: type="attention-benchmark", origins=["pytorch"], extra_info={ - "operator_name": backend, + "operator_name": operator_name, }, ), metric=MetricInfo( diff --git a/buckbuild.bzl b/buckbuild.bzl index 4c1affd10e1bc..9f18ad4849dde 100644 --- a/buckbuild.bzl +++ b/buckbuild.bzl @@ -8,7 +8,7 @@ load("//tools/build_defs:fb_xplat_genrule.bzl", "fb_xplat_genrule") load("//tools/build_defs/windows:windows_flag_map.bzl", "windows_convert_gcc_clang_flags") load("//tools/build_defs:fbsource_utils.bzl", "is_arvr_mode") load("//tools/build_defs:glob_defs.bzl", "subdir_glob") -load("//tools/build_defs:platform_defs.bzl", "APPLETVOS", "IOS", "MACOSX") +load("//tools/build_defs:platform_defs.bzl", "IOS", "MACOSX") load("//tools/build_defs:type_defs.bzl", "is_list", "is_string") load("//tools/build_defs/android:build_mode_defs.bzl", is_production_build_android = "is_production_build") load("//tools/build_defs/apple:build_mode_defs.bzl", is_production_build_ios = "is_production_build", is_profile_build_ios = "is_profile_build") @@ -1090,7 +1090,7 @@ def define_buck_targets( srcs = [ "caffe2/core/common.cc", ], - apple_sdks = (IOS, MACOSX, APPLETVOS), + apple_sdks = (IOS, MACOSX), compiler_flags = get_pt_compiler_flags(), labels = labels, # @lint-ignore BUCKLINT link_whole diff --git a/build_variables.bzl b/build_variables.bzl index 70121e19d8099..258e739300c1e 100644 --- a/build_variables.bzl +++ b/build_variables.bzl @@ -1025,6 +1025,7 @@ libtorch_python_core_sources = [ libtorch_python_distributed_core_sources = [ "torch/csrc/distributed/c10d/init.cpp", "torch/csrc/distributed/c10d/python_comm_hook.cpp", + "torch/csrc/distributed/c10d/python_callback_work.cpp", ] libtorch_python_distributed_sources = libtorch_python_distributed_core_sources + [ diff --git a/c10/core/Allocator.h b/c10/core/Allocator.h index 747b73da01352..7d2c814fe84f7 100644 --- a/c10/core/Allocator.h +++ b/c10/core/Allocator.h @@ -19,6 +19,17 @@ namespace c10 { +using CaptureId_t = unsigned long long; +// first is set if the instance is created by CUDAGraph::capture_begin. 
+// second is set if the instance is created by at::cuda::graph_pool_handle. +using MempoolId_t = std::pair; + +struct MempoolIdHash { + std::size_t operator()(const MempoolId_t& mempool_id) const noexcept { + return mempool_id.first != 0 ? mempool_id.first : mempool_id.second; + } +}; + // A DataPtr is a unique pointer (with an attached deleter and some // context for the deleter) to some memory, which also records what // device is for its data. diff --git a/c10/core/AutogradState.h b/c10/core/AutogradState.h index ad168b8c05987..d2b9cc080413d 100644 --- a/c10/core/AutogradState.h +++ b/c10/core/AutogradState.h @@ -1,6 +1,8 @@ #pragma once +#include #include +#include namespace c10 { @@ -15,7 +17,8 @@ struct C10_API AutogradState { bool inference_mode, bool fw_grad_mode, bool multithreading_enabled) - : grad_mode_(grad_mode), + : graph_exec_group_(std::nullopt), + grad_mode_(grad_mode), inference_mode_(inference_mode), fw_grad_mode_(fw_grad_mode), multithreading_enabled_(multithreading_enabled), @@ -41,6 +44,10 @@ struct C10_API AutogradState { view_replay_enabled_ = view_replay_enabled; } + void set_graph_exec_group(std::optional group) { + graph_exec_group_ = std::move(group); + } + bool get_grad_mode() const { return grad_mode_; } @@ -61,7 +68,12 @@ struct C10_API AutogradState { return view_replay_enabled_; } + const std::optional& get_graph_exec_group() const { + return graph_exec_group_; + } + private: + std::optional graph_exec_group_; bool grad_mode_ : 1; bool inference_mode_ : 1; bool fw_grad_mode_ : 1; diff --git a/c10/core/CachingDeviceAllocator.h b/c10/core/CachingDeviceAllocator.h index 0bec03ae417fa..c95d0714ce3bd 100644 --- a/c10/core/CachingDeviceAllocator.h +++ b/c10/core/CachingDeviceAllocator.h @@ -96,6 +96,13 @@ struct C10_API DeviceAllocator : public c10::Allocator { // Resets peak memory usage statistics for the specified device virtual void resetPeakStats(c10::DeviceIndex device) = 0; + + // Return the free memory size and total memory size in bytes for the + // specified device. 
+ virtual std::pair getMemoryInfo(c10::DeviceIndex device) { + TORCH_CHECK_NOT_IMPLEMENTED( + false, "getMemoryInfo is not implemented for this allocator yet."); + } }; // This function is used to get the DeviceAllocator for a specific device type diff --git a/c10/core/DispatchKeySet.cpp b/c10/core/DispatchKeySet.cpp index 72e72f49a5e40..107530e9e28a2 100644 --- a/c10/core/DispatchKeySet.cpp +++ b/c10/core/DispatchKeySet.cpp @@ -59,6 +59,9 @@ constexpr DispatchKeySet nested_dispatch_keyset = {DispatchKey::AutogradNestedTensor, DispatchKey::NestedTensor}) | DispatchKeySet(DispatchKeySet::RAW, full_backend_mask); +constexpr DispatchKeySet functorch_batched_dispatch_keyset = + DispatchKeySet(DispatchKey::FuncTorchBatched); + DispatchKeySet getRuntimeDispatchKeySet(DispatchKey t) { TORCH_INTERNAL_ASSERT(t != DispatchKey::Undefined); switch (t) { @@ -77,6 +80,8 @@ DispatchKeySet getRuntimeDispatchKeySet(DispatchKey t) { return backend_dispatch_keyset; case DispatchKey::CompositeExplicitAutogradNonFunctional: return non_functional_backend_dispatch_keyset; + case DispatchKey::FuncTorchBatchedDecomposition: + return functorch_batched_dispatch_keyset; default: return DispatchKeySet(t); } diff --git a/c10/core/ScalarType.h b/c10/core/ScalarType.h index ba1068e72695c..d8885804505a1 100644 --- a/c10/core/ScalarType.h +++ b/c10/core/ScalarType.h @@ -27,6 +27,7 @@ #include C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum") +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default") namespace c10 { @@ -205,6 +206,12 @@ inline bool isSignedType(ScalarType t) { break; // Do not add default here, but rather define behavior of every new entry // here. `-Wswitch-enum` would raise a warning in those cases. + // TODO: get PyTorch to adopt exhaustive switches by default with a way to + // opt specific switches to being non-exhaustive. + // Exhaustive: + // `-Wswitch-enum`, `-Wswitch-default`, `-Wno-covered-switch-default` + // Non-Exhaustive: + // `-Wno-switch-enum`, `-Wswitch-default`, `-Wcovered-switch-default` } TORCH_CHECK(false, "Unknown ScalarType ", t); #undef CASE_ISSIGNED diff --git a/c10/core/SymBool.cpp b/c10/core/SymBool.cpp index d804eb9d27409..48c407b8b069c 100644 --- a/c10/core/SymBool.cpp +++ b/c10/core/SymBool.cpp @@ -1,4 +1,5 @@ #include +#include #include namespace c10 { @@ -111,4 +112,17 @@ bool SymBool::has_hint() const { return toSymNodeImpl()->has_hint(); } +SymInt SymBool::toSymInt() const { + // If concrete bool, return concrete SymInt + if (auto ma = maybe_as_bool()) { + return SymInt(*ma ? 
1 : 0); + } + + // Symbolic case: use sym_ite to convert bool to int (0 or 1) + auto node = toSymNodeImpl(); + auto one_node = node->wrap_int(1); + auto zero_node = node->wrap_int(0); + return SymInt(node->sym_ite(one_node, zero_node)); +} + } // namespace c10 diff --git a/c10/core/SymBool.h b/c10/core/SymBool.h index d5d509e239b1d..a27a28a5bf8a3 100644 --- a/c10/core/SymBool.h +++ b/c10/core/SymBool.h @@ -12,6 +12,8 @@ namespace c10 { +class SymInt; + class C10_API SymBool { public: /*implicit*/ SymBool(bool b) : data_(b) {} @@ -80,6 +82,10 @@ class C10_API SymBool { return toSymNodeImplUnowned()->constant_bool(); } + // Convert SymBool to SymInt (0 or 1) + // This is the C++ equivalent of Python's cast_symbool_to_symint_guardless + SymInt toSymInt() const; + bool is_heap_allocated() const { return ptr_; } diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 66893b86c8469..420ed73e48d21 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -57,6 +57,8 @@ C10_DECLARE_bool(caffe2_keep_on_shrink); // respect caffe2_keep_on_shrink. C10_DECLARE_int64(caffe2_max_keep_on_shrink_memory); +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default") + namespace at { class Tensor; class TensorBase; @@ -3303,3 +3305,5 @@ static_assert( #undef C10_GCC_VERSION_MINOR } // namespace c10 + +C10_DIAGNOSTIC_POP() diff --git a/c10/cuda/CUDAAllocatorConfig.cpp b/c10/cuda/CUDAAllocatorConfig.cpp index 3046259b48a3e..5414d838cd8c4 100644 --- a/c10/cuda/CUDAAllocatorConfig.cpp +++ b/c10/cuda/CUDAAllocatorConfig.cpp @@ -106,6 +106,9 @@ void CUDAAllocatorConfig::parseArgs(const std::string& env) { } else if (key == "graph_capture_record_stream_reuse") { i = parseGraphCaptureRecordStreamReuse(tokenizer, i); used_native_specific_option = true; + } else if (key == "per_process_memory_fraction") { + i = parsePerProcessMemoryFraction(tokenizer, i); + used_native_specific_option = true; } else { const auto& keys = c10::CachingAllocator::AcceleratorAllocatorConfig::getKeys(); @@ -146,6 +149,18 @@ size_t CUDAAllocatorConfig::parseGraphCaptureRecordStreamReuse( return i; } +double CUDAAllocatorConfig::parsePerProcessMemoryFraction( + const c10::CachingAllocator::ConfigTokenizer& tokenizer, + size_t i) { + tokenizer.checkToken(++i, ":"); + double val_env = tokenizer.toDouble(++i); + TORCH_CHECK_VALUE( + val_env >= 0.0 && val_env <= 1.0, + "per_process_memory_fraction is invalid, set it in [0.0, 1.0]"); + m_per_process_memory_fraction = val_env; + return i; +} + size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads( const c10::CachingAllocator::ConfigTokenizer& tokenizer, size_t i) { diff --git a/c10/cuda/CUDAAllocatorConfig.h b/c10/cuda/CUDAAllocatorConfig.h index d61f69467a2dc..4e6097a406bc2 100644 --- a/c10/cuda/CUDAAllocatorConfig.h +++ b/c10/cuda/CUDAAllocatorConfig.h @@ -61,6 +61,10 @@ class C10_CUDA_API CUDAAllocatorConfig { return instance().m_graph_capture_record_stream_reuse; } + static double per_process_memory_fraction() { + return instance().m_per_process_memory_fraction; + } + /** Pinned memory allocator settings */ static bool pinned_use_cuda_host_register() { return instance().m_pinned_use_cuda_host_register; @@ -152,7 +156,8 @@ class C10_CUDA_API CUDAAllocatorConfig { "pinned_use_hip_host_register", "graph_capture_record_stream_reuse", "pinned_reserve_segment_size_mb", - "pinned_num_register_threads"}; + "pinned_num_register_threads", + "per_process_memory_fraction"}; return keys; } @@ -177,6 +182,9 @@ class C10_CUDA_API CUDAAllocatorConfig { size_t 
parseGraphCaptureRecordStreamReuse( const c10::CachingAllocator::ConfigTokenizer& tokenizer, size_t i); + double parsePerProcessMemoryFraction( + const c10::CachingAllocator::ConfigTokenizer& tokenizer, + size_t i); std::atomic m_pinned_num_register_threads{1}; std::atomic m_pinned_reserve_segment_size_mb{0}; @@ -189,6 +197,7 @@ class C10_CUDA_API CUDAAllocatorConfig { std::atomic m_release_lock_on_cudamalloc{false}; std::atomic m_pinned_use_cuda_host_register{false}; std::atomic m_graph_capture_record_stream_reuse{false}; + std::atomic m_per_process_memory_fraction{1.0}; }; // Keep this for backwards compatibility diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index 091e580f95819..9e7823a394302 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -1012,12 +1012,6 @@ PrivatePoolState::PrivatePoolState( } } -struct MempoolIdHash { - std::size_t operator()(const MempoolId_t& mempool_id) const noexcept { - return mempool_id.first != 0 ? mempool_id.first : mempool_id.second; - } -}; - cudaError_t allocPrimitive(void** ptr, size_t size, AllocParams& p) { if (p.pool->owner_PrivatePool && p.pool->owner_PrivatePool->allocator()) { *ptr = p.pool->owner_PrivatePool->allocator()->raw_alloc(size); @@ -1100,7 +1094,7 @@ class RingBuffer { } // anonymous namespace } // namespace Native -static std::string reportProcessMemoryInfo(c10::DeviceIndex device) { +static std::string reportProcessMemoryInfo(const cudaDeviceProp& prop) { #ifdef PYTORCH_C10_DRIVER_API_SUPPORTED void* nvml_handle = DriverAPI::get_nvml_handle(); if (!nvml_handle) { @@ -1111,9 +1105,6 @@ static std::string reportProcessMemoryInfo(c10::DeviceIndex device) { return true; }(); - cudaDeviceProp prop{}; - C10_CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); - // NOLINTNEXTLINE(*-c-arrays) char pci_id[80]; snprintf( @@ -1215,14 +1206,16 @@ class DeviceCachingAllocator { // record used memory. size_t total_allocated_memory = 0; - size_t allowed_memory_maximum = 0; + cudaDeviceProp device_prop; + + // maximum amount of memory that device is allowed to + // allocate. This is set iff memory fraction is less than 1 + std::optional allowed_memory_maximum{std::nullopt}; // all live expandable segments std::vector expandable_segments_; std::vector devices_with_peer_access_; - bool set_fraction = false; - bool record_history = false; std::atomic context_recorder_; @@ -1264,6 +1257,9 @@ class DeviceCachingAllocator { : device_id(id), large_blocks(/*small=*/false), small_blocks(/*small=*/true) { + C10_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, id)); + + setMemoryFraction(CUDAAllocatorConfig::per_process_memory_fraction()); stats.max_split_size = static_cast(AcceleratorAllocatorConfig::max_split_size()); context_recorder_.store(nullptr); @@ -1399,7 +1395,7 @@ class DeviceCachingAllocator { if (!block_found) { // Do garbage collection if the flag is set. 
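As an aside on the `SymBool::toSymInt()` helper added in c10/core/SymBool.{h,cpp} above: a minimal sketch of its intended use, assuming a PyTorch build that already contains this change. This is illustrative only and not part of the patch; the CUDACachingAllocator hunk resumes below.

```cpp
// Illustrative sketch, not part of the patch: exercises c10::SymBool::toSymInt().
#include <c10/core/SymBool.h>
#include <c10/core/SymInt.h>
#include <iostream>

int main() {
  c10::SymBool cond(true); // concrete, bool-backed SymBool
  c10::SymInt as_int = cond.toSymInt();
  // Concrete SymBools map directly to 0 or 1; symbolic ones are lowered via
  // sym_ite(1, 0), so no guard is introduced on the underlying expression.
  std::cout << as_int << "\n"; // prints 1
  return 0;
}
```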
if (C10_UNLIKELY( - set_fraction && + allowed_memory_maximum.has_value() && AcceleratorAllocatorConfig::garbage_collection_threshold() > 0.0)) { garbage_collect_cached_blocks(context); @@ -1456,11 +1452,12 @@ class DeviceCachingAllocator { C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total)); std::string allowed_info; - if (set_fraction) { - allowed_info = format_size(allowed_memory_maximum) + " allowed; "; + if (allowed_memory_maximum.has_value()) { + allowed_info = + format_size(allowed_memory_maximum.value()) + " allowed; "; } - std::string proc_info = reportProcessMemoryInfo(device_id); + std::string proc_info = reportProcessMemoryInfo(device_prop); record_trace( TraceEntry::OOM, @@ -1518,7 +1515,7 @@ class DeviceCachingAllocator { for (const auto& obs : observers_local) { obs(device_id, alloc_size, - set_fraction ? allowed_memory_maximum : device_total, + allowed_memory_maximum.value_or(device_total), device_free); } @@ -2015,25 +2012,26 @@ class DeviceCachingAllocator { /** get memory fraction limiting maximum allocated memory **/ double getMemoryFraction() { - if (!set_fraction) { + if (!allowed_memory_maximum.has_value()) { return 1.0; } - size_t device_free = 0; - size_t device_total = 0; - C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total)); - return static_cast(allowed_memory_maximum) / - static_cast(device_total); + return static_cast(allowed_memory_maximum.value()) / + static_cast(device_prop.totalGlobalMem); } /** set memory fraction to limit maximum allocated memory **/ void setMemoryFraction(double fraction) { - size_t device_free = 0; - size_t device_total = 0; - C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total)); - allowed_memory_maximum = - static_cast(fraction * static_cast(device_total)); - set_fraction = true; + TORCH_CHECK( + 0 <= fraction && fraction <= 1, + "invalid fraction:", + fraction, + ". Please set within [0, 1]."); + allowed_memory_maximum = std::nullopt; + if (fraction < 1.0) { + allowed_memory_maximum = static_cast( + fraction * static_cast(device_prop.totalGlobalMem)); + } } /** get expandable segment size for all the streams on device **/ @@ -3010,7 +3008,7 @@ class DeviceCachingAllocator { BlockPool& pool = *p.pool; if (C10_UNLIKELY( - set_fraction && + allowed_memory_maximum.has_value() && AcceleratorAllocatorConfig::garbage_collection_threshold() > 0.0)) { // Track block reuse interval only when garbage collection is enabled. 
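The hunks above replace the `set_fraction` flag with an optional `allowed_memory_maximum` and route the new `per_process_memory_fraction` key through `CUDAAllocatorConfig`. A short sketch of the two ways the fraction can now be set, assuming a CUDA build (illustrative only, not part of the patch; the environment variable is only read when the allocator initializes). The hunk continues below.

```cpp
// Illustrative sketch, not part of the patch: both paths reach
// DeviceCachingAllocator::setMemoryFraction.
#include <c10/cuda/CUDACachingAllocator.h>
#include <cstdlib>

void cap_gpu0_at_half() {
  // New in this diff: the fraction can come from the allocator config string,
  // which is parsed when the allocator initializes.
  setenv("PYTORCH_CUDA_ALLOC_CONF", "per_process_memory_fraction:0.5", /*overwrite=*/1);

  // Existing programmatic path; validation now happens inside the device
  // allocator, and a fraction of 1.0 is stored as "no limit" (std::nullopt).
  c10::cuda::CUDACachingAllocator::setMemoryFraction(0.5, /*device=*/0);
}
```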
++pool.get_free_blocks_call_count; @@ -3083,7 +3081,7 @@ class DeviceCachingAllocator { size_t gc_threshold = static_cast( AcceleratorAllocatorConfig::garbage_collection_threshold() * - static_cast(allowed_memory_maximum)); + static_cast(allowed_memory_maximum.value())); // No need to trigger GC yet if (total_allocated_memory <= gc_threshold) { return; @@ -3161,8 +3159,8 @@ class DeviceCachingAllocator { bool active_pool = p.pool->owner_PrivatePool && p.pool->owner_PrivatePool->allocator(); - if (set_fraction && - total_allocated_memory + size > allowed_memory_maximum) { + if (allowed_memory_maximum.has_value() && + total_allocated_memory + size > allowed_memory_maximum.value()) { p.err = cudaErrorMemoryAllocation; return false; // Temporarily disable checkpointing & cudagraphs internally @@ -3859,7 +3857,6 @@ class NativeCachingAllocator : public CUDAAllocator { "Allocator not initialized for device ", device, ": did you call init?"); - C10_CUDA_CHECK(c10::cuda::SetDevice(device)); return device_allocator[device]->getMemoryFraction(); } @@ -3869,12 +3866,6 @@ class NativeCachingAllocator : public CUDAAllocator { "Allocator not initialized for device ", device, ": did you call init?"); - TORCH_CHECK( - 0 <= fraction && fraction <= 1, - "invalid fraction:", - fraction, - ". Please set within [0, 1]."); - C10_CUDA_CHECK(c10::cuda::SetDevice(device)); device_allocator[device]->setMemoryFraction(fraction); } @@ -4513,66 +4504,3 @@ std::atomic allocator; static BackendStaticInitializer backend_static_initializer; } // namespace cuda::CUDACachingAllocator } // namespace c10 - -namespace c10::cuda { - -// uid_ is incremented when a user creates a MemPool, -// for example: using graph_pool_handle() or c10::cuda::MemPool(). -// -// uuid_ is incremented when CUDAGraph creates a MemPool -// as a result of a user not providing a pool. -// -// MempoolId_t of {0, 0} is used to denote when no MemPool has been -// passed to a function, either by user or CUDAGraphs. For example, -// default value of MempoolId_t for capture_begin function is {0, 0}. -// That's why uid_ and uuid_ start at 1. 
-std::atomic MemPool::uid_{1}; -std::atomic MemPool::uuid_{1}; - -MemPool::MemPool( - CUDACachingAllocator::CUDAAllocator* allocator, - bool is_user_created, - bool use_on_oom) - : allocator_(allocator), is_user_created_(is_user_created) { - if (is_user_created_) { - id_ = {0, uid_++}; - } else { - id_ = {uuid_++, 0}; - } - device_ = c10::cuda::current_device(); - CUDACachingAllocator::createOrIncrefPool(device_, id_, allocator); - if (use_on_oom) { - CUDACachingAllocator::setUseOnOOM(device_, id_); - } -} - -MemPool::~MemPool() { - TORCH_INTERNAL_ASSERT(use_count() == 1); - CUDACachingAllocator::releasePool(device_, id_); - c10::cuda::CUDACachingAllocator::emptyCache(id_); -} - -MempoolId_t MemPool::id() { - return id_; -} - -CUDACachingAllocator::CUDAAllocator* MemPool::allocator() { - return allocator_; -} - -int MemPool::use_count() { - return CUDACachingAllocator::getPoolUseCount(device_, id_); -} - -c10::DeviceIndex MemPool::device() { - return device_; -} - -MempoolId_t MemPool::graph_pool_handle(bool is_user_created) { - if (is_user_created) { - return {0, uid_++}; - } - return {uuid_++, 0}; -} - -} // namespace c10::cuda diff --git a/c10/cuda/CUDACachingAllocator.h b/c10/cuda/CUDACachingAllocator.h index fbe5dab18e0ae..e7b45072f6c20 100644 --- a/c10/cuda/CUDACachingAllocator.h +++ b/c10/cuda/CUDACachingAllocator.h @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -344,6 +345,13 @@ class CUDAAllocator : public DeviceAllocator { c10::DeviceIndex device, std::shared_ptr pps) = 0; virtual std::string name() = 0; + std::pair getMemoryInfo(c10::DeviceIndex device) override { + c10::DeviceGuard device_guard({at::kCUDA, device}); + size_t free = 0; + size_t total = 0; + C10_CUDA_CHECK(cudaMemGetInfo(&free, &total)); + return {free, total}; + } }; // Allocator object, statically initialized @@ -554,41 +562,7 @@ inline std::string getUserMetadata() { } // namespace c10::cuda::CUDACachingAllocator namespace c10::cuda { - // Keep BC only using c10::CaptureId_t; using c10::MempoolId_t; - -// MemPool represents a pool of memory in a caching allocator. Currently, -// it's just the ID of the pool object maintained in the CUDACachingAllocator. -// -// An allocator pointer can be passed to the MemPool to define how the -// allocations should be done in the pool. For example: using a different -// system allocator such as ncclMemAlloc. 
-struct C10_CUDA_API MemPool { - MemPool( - CUDACachingAllocator::CUDAAllocator* allocator = nullptr, - bool is_user_created = true, - bool use_on_oom = false); - MemPool(const MemPool&) = delete; - MemPool(MemPool&&) = default; - MemPool& operator=(const MemPool&) = delete; - MemPool& operator=(MemPool&&) = default; - ~MemPool(); - - MempoolId_t id(); - CUDACachingAllocator::CUDAAllocator* allocator(); - int use_count(); - c10::DeviceIndex device(); - static MempoolId_t graph_pool_handle(bool is_user_created = true); - - private: - static std::atomic uid_; - static std::atomic uuid_; - CUDACachingAllocator::CUDAAllocator* allocator_; - bool is_user_created_; - MempoolId_t id_; - c10::DeviceIndex device_; -}; - } // namespace c10::cuda diff --git a/c10/cuda/CUDADeviceAssertionHost.cpp b/c10/cuda/CUDADeviceAssertionHost.cpp index d67ee4b23e692..9b7c3568a9833 100644 --- a/c10/cuda/CUDADeviceAssertionHost.cpp +++ b/c10/cuda/CUDADeviceAssertionHost.cpp @@ -295,11 +295,19 @@ DeviceAssertionsData* CUDAKernelLaunchRegistry:: C10_CUDA_CHECK_WO_DSA( cudaMallocManaged(&uvm_assertions_ptr, sizeof(DeviceAssertionsData))); +#if CUDART_VERSION >= 13000 + cudaMemLocation cpuDevice; + cpuDevice.type = cudaMemLocationTypeDevice; + cpuDevice.id = cudaCpuDeviceId; +#else + const auto cpuDevice = cudaCpuDeviceId; +#endif + C10_CUDA_CHECK_WO_DSA(cudaMemAdvise( uvm_assertions_ptr, sizeof(DeviceAssertionsData), cudaMemAdviseSetPreferredLocation, - cudaCpuDeviceId)); + cpuDevice)); // GPU will establish direct mapping of data in CPU memory, no page faults // will be generated @@ -307,7 +315,7 @@ DeviceAssertionsData* CUDAKernelLaunchRegistry:: uvm_assertions_ptr, sizeof(DeviceAssertionsData), cudaMemAdviseSetAccessedBy, - cudaCpuDeviceId)); + cpuDevice)); // Initialize the memory from the CPU; otherwise, pages may have to be created // on demand. We think that UVM documentation indicates that first access may diff --git a/c10/cuda/CUDAMallocAsyncAllocator.cpp b/c10/cuda/CUDAMallocAsyncAllocator.cpp index 93bce51f1b9d0..674eb00035c50 100644 --- a/c10/cuda/CUDAMallocAsyncAllocator.cpp +++ b/c10/cuda/CUDAMallocAsyncAllocator.cpp @@ -427,7 +427,6 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator { // on the current device each later call sees. void init(int dev_count) override { static bool called = [](int dev_count) { - ; // Are there external guarantees init will be called before // any of the allocator's other functions? 
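For the `DeviceAllocator::getMemoryInfo` hook added in c10/core/CachingDeviceAllocator.h earlier in this diff (and overridden for CUDA on top of `cudaMemGetInfo` in CUDACachingAllocator.h), here is a sketch of a caller. `c10::getDeviceAllocator` is assumed to be the device-type lookup referenced in that header; this snippet is illustrative and not part of the patch.

```cpp
// Illustrative sketch, not part of the patch. getDeviceAllocator() is assumed
// to be the device-type lookup mentioned in CachingDeviceAllocator.h.
#include <c10/core/CachingDeviceAllocator.h>
#include <c10/core/DeviceType.h>
#include <iostream>

void print_cuda_memory_info() {
  c10::DeviceAllocator* allocator = c10::getDeviceAllocator(c10::DeviceType::CUDA);
  // The CUDA override wraps cudaMemGetInfo; backends without an override hit
  // the default implementation, which fails with TORCH_CHECK_NOT_IMPLEMENTED.
  auto [free_bytes, total_bytes] = allocator->getMemoryInfo(/*device=*/0);
  std::cout << free_bytes << " bytes free of " << total_bytes << " bytes total\n";
}
```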
// std::lock_guard lk(general_mutex); diff --git a/c10/metal/error.h b/c10/metal/error.h new file mode 100644 index 0000000000000..bed113769747a --- /dev/null +++ b/c10/metal/error.h @@ -0,0 +1,111 @@ +#pragma once +#include + +namespace c10 { +namespace metal { +C10_METAL_CONSTEXPR unsigned error_message_count = 30; +struct ErrorMessage { + char file[128]; + char func[128]; + char message[250]; + unsigned int line; +}; + +struct ErrorMessages { +#ifdef __METAL__ + ::metal::atomic count; +#else + unsigned int count; +#endif + ErrorMessage msg[error_message_count]; +}; + +#ifdef __METAL__ +namespace detail { +static uint strncpy(device char* dst, constant const char* src, unsigned len) { + uint i = 0; + while (src[i] != 0 && i < len - 1) { + dst[i] = src[i]; + i++; + } + dst[i] = 0; + return i; +} + +inline uint print_arg( + device char* ptr, + unsigned len, + constant const char* arg) { + return strncpy(ptr, arg, len); +} + +// Returns number length as string in base10 +static inline uint base10_length(long num) { + uint rc = 1; + if (num < 0) { + num = -num; + rc += 1; + } + while (num > 9) { + num /= 10; + rc++; + } + return rc; +} + +// Converts signed integer to string +inline uint print_arg(device char* ptr, unsigned len, long arg) { + const auto arg_len = base10_length(arg); + if (arg_len >= len) + return 0; + if (arg < 0) { + ptr[0] = '-'; + arg = -arg; + } + uint idx = 1; + do { + ptr[arg_len - idx] = '0' + (arg % 10); + arg /= 10; + idx++; + } while (arg > 0); + ptr[arg_len] = 0; + return arg_len; +} + +template +inline void print_args(device char* ptr, unsigned len, T arg) { + print_arg(ptr, len, arg); +} + +template +inline void print_args(device char* ptr, unsigned len, T arg, Args... args) { + const auto rc = print_arg(ptr, len, arg); + print_args(ptr + rc, len - rc, args...); +} + +} // namespace detail + +template +static void report_error( + device ErrorMessages* msgs, + constant const char* file, + int line, + constant const char* func, + Args... args) { + const auto idx = + atomic_fetch_add_explicit(&msgs->count, 1, ::metal::memory_order_relaxed); + if (idx >= error_message_count) { + return; + } + device auto* msg = &msgs->msg[idx]; + detail::strncpy(msg->file, file, 128); + detail::strncpy(msg->func, func, 128); + detail::print_args(msg->message, 250, args...); + msg->line = line; +} + +#define TORCH_REPORT_ERROR(buf, ...) 
\ + ::c10::metal::report_error(buf, __FILE__, __LINE__, __func__, __VA_ARGS__) +#endif +} // namespace metal +} // namespace c10 diff --git a/c10/test/build.bzl b/c10/test/build.bzl index deb917dd8fcf3..7b4028ab4afed 100644 --- a/c10/test/build.bzl +++ b/c10/test/build.bzl @@ -66,6 +66,15 @@ def define_targets(rules): ], ) + rules.cc_test( + name = "util/nofatal_test", + srcs = ["util/nofatal_test.cpp"], + deps = [ + "//c10/util:base", + "@com_google_googletest//:gtest_main", + ], + ) + rules.cc_test( name = "util/ssize_test", srcs = ["util/ssize_test.cpp"], diff --git a/c10/test/util/nofatal_test.cpp b/c10/test/util/nofatal_test.cpp new file mode 100644 index 0000000000000..ba4b40b6f917e --- /dev/null +++ b/c10/test/util/nofatal_test.cpp @@ -0,0 +1,53 @@ +#include + +#include +#include + +namespace { +template <typename T> +inline void expectThrowsEq(T&& fn, const char* expected_msg) { + try { + std::forward<T>(fn)(); + } catch (const c10::Error& e) { + EXPECT_TRUE( + std::string(e.what_without_backtrace()).find(expected_msg) != + std::string::npos); + return; + } + ADD_FAILURE() << "Expected to throw exception with message \"" << expected_msg + << "\" but didn't throw"; +} +} // namespace + +TEST(NofatalTest, TorchCheckComparisons) { + // quickly make sure that the no-op path works as expected + TORCH_CHECK_EQ(1, 1) << "i am a silly message " << 1; + expectThrowsEq( + []() { TORCH_CHECK_EQ(1, 2) << "i am a silly message " << 1; }, + "Check failed: 1 == 2 (1 vs. 2). i am a silly message 1"); + expectThrowsEq( + []() { TORCH_CHECK_NE(2, 2); }, "Check failed: 2 != 2 (2 vs. 2)."); + expectThrowsEq( + []() { TORCH_CHECK_LT(2, 2); }, "Check failed: 2 < 2 (2 vs. 2)."); + expectThrowsEq( + []() { TORCH_CHECK_LE(3, 2); }, "Check failed: 3 <= 2 (3 vs. 2)."); + expectThrowsEq( + []() { TORCH_CHECK_GT(2, 2); }, "Check failed: 2 > 2 (2 vs. 2)."); + expectThrowsEq( + []() { TORCH_CHECK_GE(2, 3); }, "Check failed: 2 >= 3 (2 vs. 3)."); + expectThrowsEq( + []() { + void* p = nullptr; + TORCH_CHECK_NOTNULL(p); + }, + "Check failed: 'p' must be non NULL."); + +#if GTEST_HAS_DEATH_TEST +#ifndef NDEBUG + // in a debug build, DCHECK should result in death + EXPECT_DEATH(TORCH_DCHECK_EQ(1, 2), "Check failed"); +#else + TORCH_DCHECK_EQ(1, 2); // no-op +#endif +#endif // GTEST_HAS_DEATH_TEST +} diff --git a/c10/util/ArrayRef.h b/c10/util/ArrayRef.h index 64605f5153595..1311867ef797e 100644 --- a/c10/util/ArrayRef.h +++ b/c10/util/ArrayRef.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -40,200 +41,99 @@ namespace c10 { /// /// This is intended to be trivially copyable, so it should be passed by /// value. +/// +/// NOTE: We have refactored out the headeronly parts of the ArrayRef struct +/// into HeaderOnlyArrayRef. As adding `virtual` would change the performance of +/// the underlying constexpr calls, we rely on apparent-type dispatch for +/// inheritance. This should be fine because their memory format is the same, +/// and it is never incorrect for ArrayRef to call HeaderOnlyArrayRef methods. +/// However, you should prefer to use ArrayRef when possible, because its use +/// of TORCH_CHECK will lead to better user-facing error messages. template <typename T> -class ArrayRef final { +class ArrayRef final : public HeaderOnlyArrayRef<T> { public: - using iterator = const T*; - using const_iterator = const T*; - using size_type = size_t; - using value_type = T; - - using reverse_iterator = std::reverse_iterator<iterator>; - - private: - /// The start of the array, in an external buffer. - const T* Data; - - /// The number of elements.
- size_type Length; - - void debugCheckNullptrInvariant() { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - Data != nullptr || Length == 0, - "created ArrayRef with nullptr and non-zero length! std::optional relies on this being illegal"); - } - - public: - /// @name Constructors + /// @name Constructors, all inherited from HeaderOnlyArrayRef except for + /// SmallVector. As inherited constructors won't work with class template + /// argument deduction (CTAD) until C++23, we add deduction guides after + /// the class definition to enable CTAD. /// @{ - /// Construct an empty ArrayRef. - /* implicit */ constexpr ArrayRef() : Data(nullptr), Length(0) {} - - /// Construct an ArrayRef from a single element. - // TODO Make this explicit - constexpr ArrayRef(const T& OneElt) : Data(&OneElt), Length(1) {} - - /// Construct an ArrayRef from a pointer and length. - constexpr ArrayRef(const T* data, size_t length) - : Data(data), Length(length) { - debugCheckNullptrInvariant(); - } - - /// Construct an ArrayRef from a range. - constexpr ArrayRef(const T* begin, const T* end) - : Data(begin), Length(end - begin) { - debugCheckNullptrInvariant(); - } + using HeaderOnlyArrayRef::HeaderOnlyArrayRef; /// Construct an ArrayRef from a SmallVector. This is templated in order to /// avoid instantiating SmallVectorTemplateCommon whenever we /// copy-construct an ArrayRef. + /// NOTE: this is the only constructor that is not inherited from + /// HeaderOnlyArrayRef. template /* implicit */ ArrayRef(const SmallVectorTemplateCommon& Vec) - : Data(Vec.data()), Length(Vec.size()) { - debugCheckNullptrInvariant(); - } - - template < - typename Container, - typename U = decltype(std::declval().data()), - typename = std::enable_if_t< - (std::is_same_v || std::is_same_v)>> - /* implicit */ ArrayRef(const Container& container) - : Data(container.data()), Length(container.size()) { - debugCheckNullptrInvariant(); - } - - /// Construct an ArrayRef from a std::vector. - // The enable_if stuff here makes sure that this isn't used for - // std::vector, because ArrayRef can't work on a std::vector - // bitfield. - template - /* implicit */ ArrayRef(const std::vector& Vec) - : Data(Vec.data()), Length(Vec.size()) { - static_assert( - !std::is_same_v, - "ArrayRef cannot be constructed from a std::vector bitfield."); - } - - /// Construct an ArrayRef from a std::array - template - /* implicit */ constexpr ArrayRef(const std::array& Arr) - : Data(Arr.data()), Length(N) {} - - /// Construct an ArrayRef from a C array. - template - // NOLINTNEXTLINE(*c-arrays*) - /* implicit */ constexpr ArrayRef(const T (&Arr)[N]) : Data(Arr), Length(N) {} - - /// Construct an ArrayRef from a std::initializer_list. - /* implicit */ constexpr ArrayRef(const std::initializer_list& Vec) - : Data( - std::begin(Vec) == std::end(Vec) ? static_cast(nullptr) - : std::begin(Vec)), - Length(Vec.size()) {} + : HeaderOnlyArrayRef(Vec.data(), Vec.size()) {} /// @} - /// @name Simple Operations + /// @name Simple Operations, mostly inherited from HeaderOnlyArrayRef /// @{ - constexpr iterator begin() const { - return Data; - } - constexpr iterator end() const { - return Data + Length; - } - - // These are actually the same as iterator, since ArrayRef only - // gives you const iterators. 
- constexpr const_iterator cbegin() const { - return Data; - } - constexpr const_iterator cend() const { - return Data + Length; - } - - constexpr reverse_iterator rbegin() const { - return reverse_iterator(end()); - } - constexpr reverse_iterator rend() const { - return reverse_iterator(begin()); - } - - /// Check if all elements in the array satisfy the given expression - constexpr bool allMatch(const std::function& pred) const { - return std::all_of(cbegin(), cend(), pred); - } - - /// empty - Check if the array is empty. - constexpr bool empty() const { - return Length == 0; - } - - constexpr const T* data() const { - return Data; - } - - /// size - Get the array size. - constexpr size_t size() const { - return Length; - } - /// front - Get the first element. + /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of + /// STD_TORCH_CHECK constexpr const T& front() const { TORCH_CHECK( - !empty(), "ArrayRef: attempted to access front() of empty list"); - return Data[0]; + !this->empty(), "ArrayRef: attempted to access front() of empty list"); + return this->Data[0]; } /// back - Get the last element. + /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of + /// STD_TORCH_CHECK constexpr const T& back() const { - TORCH_CHECK(!empty(), "ArrayRef: attempted to access back() of empty list"); - return Data[Length - 1]; - } - - /// equals - Check for element-wise equality. - constexpr bool equals(ArrayRef RHS) const { - return Length == RHS.Length && std::equal(begin(), end(), RHS.begin()); + TORCH_CHECK( + !this->empty(), "ArrayRef: attempted to access back() of empty list"); + return this->Data[this->Length - 1]; } /// slice(n, m) - Take M elements of the array starting at element N + /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of + /// STD_TORCH_CHECK constexpr ArrayRef slice(size_t N, size_t M) const { TORCH_CHECK( - N + M <= size(), + N + M <= this->size(), "ArrayRef: invalid slice, N = ", N, "; M = ", M, "; size = ", - size()); - return ArrayRef(data() + N, M); + this->size()); + return ArrayRef(this->data() + N, M); } /// slice(n) - Chop off the first N elements of the array. + /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of + /// STD_TORCH_CHECK constexpr ArrayRef slice(size_t N) const { TORCH_CHECK( - N <= size(), "ArrayRef: invalid slice, N = ", N, "; size = ", size()); - return slice(N, size() - N); + N <= this->size(), + "ArrayRef: invalid slice, N = ", + N, + "; size = ", + this->size()); + return slice(N, this->size() - N); // should this slice be this->slice? } /// @} /// @name Operator Overloads /// @{ - constexpr const T& operator[](size_t Index) const { - return Data[Index]; - } /// Vector compatibility + /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of + /// STD_TORCH_CHECK constexpr const T& at(size_t Index) const { TORCH_CHECK( - Index < Length, + Index < this->Length, "ArrayRef: invalid index Index = ", Index, "; Length = ", - Length); - return Data[Index]; + this->Length); + return this->Data[Index]; } /// Disallow accidental assignment from a temporary. 
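The ArrayRef hunk above switches the class to inherit from `HeaderOnlyArrayRef<T>`, and the following hunk adds explicit deduction guides because constructors inherited from a base do not participate in class template argument deduction before C++23. A small sketch of the call pattern the guides preserve, assuming this header; it is illustrative and not part of the patch.

```cpp
// Illustrative sketch, not part of the patch: CTAD still works after the
// HeaderOnlyArrayRef refactor thanks to the explicit deduction guides.
#include <c10/util/ArrayRef.h>
#include <cstdint>
#include <vector>

int64_t sum(c10::ArrayRef<int64_t> xs) {
  int64_t acc = 0;
  for (auto x : xs) {
    acc += x;
  }
  return acc;
}

int main() {
  std::vector<int64_t> v{1, 2, 3};
  // Without the guides, the constructors inherited from HeaderOnlyArrayRef<T>
  // would not drive class template argument deduction until C++23.
  auto xs = c10::ArrayRef(v); // deduces c10::ArrayRef<int64_t>
  return sum(xs) == 6 ? 0 : 1;
}
```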
@@ -253,16 +153,48 @@ class ArrayRef final { std::enable_if_t, ArrayRef>& operator=( std::initializer_list) = delete; - /// @} - /// @name Expensive Operations - /// @{ - std::vector vec() const { - return std::vector(Data, Data + Length); - } - /// @} }; +/// Deduction guides for ArrayRef to support CTAD with inherited constructors +/// These mirror the constructors inherited from HeaderOnlyArrayRef +/// @{ + +// Single element constructor +template +ArrayRef(const T&) -> ArrayRef; + +// Pointer and length constructor +template +ArrayRef(const T*, size_t) -> ArrayRef; + +// Range constructor (begin, end) +template +ArrayRef(const T*, const T*) -> ArrayRef; + +// Generic container constructor (anything with .data() and .size()) +template +ArrayRef(const Container&) -> ArrayRef< + std::remove_pointer_t().data())>>; + +// std::vector constructor +template +ArrayRef(const std::vector&) -> ArrayRef; + +// std::array constructor +template +ArrayRef(const std::array&) -> ArrayRef; + +// C array constructor +template +ArrayRef(const T (&)[N]) -> ArrayRef; + +// std::initializer_list constructor +template +ArrayRef(const std::initializer_list&) -> ArrayRef; + +/// @} + template std::ostream& operator<<(std::ostream& out, ArrayRef list) { int i = 0; diff --git a/c10/util/Exception.h b/c10/util/Exception.h index 6b2fd626bfb5e..28a2ee06ecd3e 100644 --- a/c10/util/Exception.h +++ b/c10/util/Exception.h @@ -702,6 +702,98 @@ namespace c10::detail { #define TORCH_CHECK_ARG(cond, argN, ...) \ TORCH_CHECK(cond, "invalid argument ", argN, ": ", __VA_ARGS__) +#ifndef FATAL_IF +#ifdef C10_USE_GLOG +#define FATAL_IF(condition) \ + condition ? (void)0 \ + : ::c10::LoggerVoidify() & \ + ::c10::MessageLogger(__FILE__, __LINE__, ::google::GLOG_FATAL) \ + .stream() +#else +#define FATAL_IF(condition) \ + condition ? (void)0 \ + : ::c10::LoggerVoidify() & \ + ::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_FATAL).stream() +#endif +#endif + +#ifndef NON_FATAL_IF +#ifdef C10_USE_GLOG +#define NON_FATAL_IF(condition) \ + condition ? (void)0 \ + : ::c10::LoggerVoidify() & \ + ::c10::MessageLogger( \ + __FILE__, __LINE__, ::google::GLOG_FATAL, false) \ + .stream() +#else +#define NON_FATAL_IF(condition) \ + condition ? (void)0 \ + : ::c10::LoggerVoidify() & \ + ::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_FATAL, false) \ + .stream() +#endif +#endif + +// Binary comparison check macros +#define TORCH_CHECK_OP(val1, val2, op) \ + NON_FATAL_IF(((val1)op(val2))) \ + << "Check failed: " #val1 " " #op " " #val2 " (" << (val1) << " vs. " \ + << (val2) << "). " + +#define TORCH_DCHECK_OP(val1, val2, op) \ + FATAL_IF(((val1)op(val2))) << "Check failed: " #val1 " " #op " " #val2 " (" \ + << (val1) << " vs. " << (val2) << "). 
" + +#define TORCH_CHECK_EQ(val1, val2) TORCH_CHECK_OP(val1, val2, ==) +#define TORCH_CHECK_NE(val1, val2) TORCH_CHECK_OP(val1, val2, !=) +#define TORCH_CHECK_LE(val1, val2) TORCH_CHECK_OP(val1, val2, <=) +#define TORCH_CHECK_LT(val1, val2) TORCH_CHECK_OP(val1, val2, <) +#define TORCH_CHECK_GE(val1, val2) TORCH_CHECK_OP(val1, val2, >=) +#define TORCH_CHECK_GT(val1, val2) TORCH_CHECK_OP(val1, val2, >) + +// Debug versions of TORCH_CHECK_OP macros +#ifndef NDEBUG +#define TORCH_DCHECK_EQ(val1, val2) TORCH_DCHECK_OP(val1, val2, ==) +#define TORCH_DCHECK_NE(val1, val2) TORCH_DCHECK_OP(val1, val2, !=) +#define TORCH_DCHECK_LE(val1, val2) TORCH_DCHECK_OP(val1, val2, <=) +#define TORCH_DCHECK_LT(val1, val2) TORCH_DCHECK_OP(val1, val2, <) +#define TORCH_DCHECK_GE(val1, val2) TORCH_DCHECK_OP(val1, val2, >=) +#define TORCH_DCHECK_GT(val1, val2) TORCH_DCHECK_OP(val1, val2, >) +#else // !NDEBUG +// Optimized versions - generate no code +#define TORCH_DCHECK_EQ(val1, val2) \ + while (false) \ + TORCH_DCHECK_OP(val1, val2, ==) +#define TORCH_DCHECK_NE(val1, val2) \ + while (false) \ + TORCH_DCHECK_OP(val1, val2, !=) +#define TORCH_DCHECK_LE(val1, val2) \ + while (false) \ + TORCH_DCHECK_OP(val1, val2, <=) +#define TORCH_DCHECK_LT(val1, val2) \ + while (false) \ + TORCH_DCHECK_OP(val1, val2, <) +#define TORCH_DCHECK_GE(val1, val2) \ + while (false) \ + TORCH_DCHECK_OP(val1, val2, >=) +#define TORCH_DCHECK_GT(val1, val2) \ + while (false) \ + TORCH_DCHECK_OP(val1, val2, >) +#endif // NDEBUG + +// Null pointer check macro +#define TORCH_CHECK_NOTNULL(val) \ + ::c10::CheckNotNull(__FILE__, __LINE__, #val, (val), false) + +#ifndef NDEBUG +#define TORCH_DCHECK_NOTNULL(val) \ + ::c10::CheckNotNull(__FILE__, __LINE__, #val, (val), true) +#else // !NDEBUG +#define TORCH_DCHECK_NOTNULL(val) \ + while (false) \ + TORCH_CHECK_NOTNULL(val) +#endif // NDEBUG + // ---------------------------------------------------------------------------- // Deprecated macros // ---------------------------------------------------------------------------- diff --git a/c10/util/Logging.cpp b/c10/util/Logging.cpp index 555ab685c0b5f..4bf96b1b6808a 100644 --- a/c10/util/Logging.cpp +++ b/c10/util/Logging.cpp @@ -291,6 +291,32 @@ namespace c10 { using fLB::FLAGS_logtostderr; using fLI::FLAGS_minloglevel; using fLI::FLAGS_v; + +MessageLogger::MessageLogger( + const char* file, + int line, + int severity, + bool exit_on_fatal) + : stream_(), severity_(severity), exit_on_fatal_(exit_on_fatal) {} + +MessageLogger::~MessageLogger() noexcept(false) { + if (severity_ == ::google::GLOG_FATAL) { + DealWithFatal(); + } +} + +std::stringstream& MessageLogger::stream() { + return stream_; +} + +void MessageLogger::DealWithFatal() { + if (exit_on_fatal_) { + LOG(FATAL) << stream_.str(); + } else { + throw c10::Error(stream_.str(), nullptr, nullptr); + } +} + } // namespace c10 C10_DEFINE_int( @@ -412,17 +438,16 @@ void ShowLogInfoToStderr() { FLAGS_caffe2_log_level = GLOG_INFO; } -MessageLogger::MessageLogger(const char* file, int line, int severity) - : severity_(severity) { +MessageLogger::MessageLogger( + const char* file, + int line, + int severity, + bool exit_on_fatal) + : severity_(severity), exit_on_fatal_(exit_on_fatal) { if (severity_ < FLAGS_caffe2_log_level) { // Nothing needs to be logged. 
return; } -#ifdef ANDROID - tag_ = "native"; -#else // !ANDROID - tag_ = ""; -#endif // ANDROID time_t rawtime = 0; time(&rawtime); @@ -458,7 +483,7 @@ MessageLogger::MessageLogger(const char* file, int line, int severity) } // Output the contents of the stream to the proper channel on destruction. -MessageLogger::~MessageLogger() { +MessageLogger::~MessageLogger() noexcept(false) { if (severity_ < FLAGS_caffe2_log_level) { // Nothing needs to be logged. return; @@ -498,6 +523,18 @@ MessageLogger::~MessageLogger() { } } +std::stringstream& MessageLogger::stream() { + return stream_; +} + +void MessageLogger::DealWithFatal() { + if (exit_on_fatal_) { + abort(); + } else { + throw c10::Error(stream_.str(), nullptr, nullptr); + } +} + } // namespace c10 #endif // !C10_USE_GLOG diff --git a/c10/util/Metaprogramming.cpp b/c10/util/Metaprogramming.cpp deleted file mode 100644 index f6ee24a79bcd8..0000000000000 --- a/c10/util/Metaprogramming.cpp +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/c10/util/Metaprogramming.h b/c10/util/Metaprogramming.h index d504706f3283a..a5912706e1ed1 100644 --- a/c10/util/Metaprogramming.h +++ b/c10/util/Metaprogramming.h @@ -1,224 +1 @@ -#pragma once - -#include -#include - -namespace c10::guts { - -/** - * Access information about result type or arguments from a function type. - * Example: - * using A = function_traits::return_type // A == int - * using A = function_traits::parameter_types::tuple_type - * // A == tuple - */ -template -struct function_traits { - static_assert( - !std::is_same_v, - "In function_traits, Func must be a plain function type."); -}; -template -struct function_traits { - using func_type = Result(Args...); - using return_type = Result; - using parameter_types = typelist::typelist; - static constexpr auto number_of_parameters = sizeof...(Args); -}; - -/** - * infer_function_traits: creates a `function_traits` type for a simple - * function (pointer) or functor (lambda/struct). Currently does not support - * class methods. - */ - -template -struct infer_function_traits { - using type = function_traits< - c10::guts::detail::strip_class_t>; -}; - -template -struct infer_function_traits { - using type = function_traits; -}; - -template -struct infer_function_traits { - using type = function_traits; -}; - -template -using infer_function_traits_t = typename infer_function_traits::type; - -/** - * make_function_traits: creates a `function_traits` type given a Return type - * and a typelist of Argument types - * - * Example: - * bool f(int, int); - * - * infer_function_traits_t == make_function_traits_t> - */ -template -struct make_function_traits { - static_assert( - false_t::value, - "In guts::make_function_traits, the ArgList argument must be typelist<...>."); -}; - -template -struct make_function_traits> { - using type = function_traits; -}; - -template -using make_function_traits_t = - typename make_function_traits::type; - -/** - * make_offset_index_sequence - * Like make_index_sequence, but starting from Start instead of 0. 
- * - * Example: - * make_offset_index_sequence<10, 3> == std::index_sequence<10, 11, 12> - */ -template -struct make_offset_index_sequence_impl - : make_offset_index_sequence_impl { - static_assert( - static_cast(Start) >= 0, - "make_offset_index_sequence: Start < 0"); - static_assert(static_cast(N) >= 0, "make_offset_index_sequence: N < 0"); -}; - -template -struct make_offset_index_sequence_impl { - typedef std::index_sequence type; -}; - -template -using make_offset_index_sequence = - typename make_offset_index_sequence_impl::type; - -/** - * Use tuple_elements to extract a position-indexed subset of elements - * from the argument tuple into a result tuple. - * - * Example: - * std::tuple t = std::make_tuple(0, "HEY", 2.0); - * std::tuple result = tuple_elements(t, std::index_sequence<0, - * 2>()); - */ -template -constexpr auto tuple_elements(Tuple t, std::index_sequence /*unused*/) { - return std::tuple...>(std::get(t)...); -} - -/** - * Use tuple_take to extract the first or last n elements from the argument - * tuple into a result tuple. - * - * Example: - * std::tuple t = std::make_tuple(0, "HEY", 2.0); - * std::tuple first_two = tuple_take(t); - * std::tuple last_two = tuple_take(t); - */ -template -struct TupleTake {}; - -template -struct TupleTake= 0, void>> { - static auto call(Tuple t) { - constexpr size_t size = std::tuple_size(); - static_assert(N <= size, "tuple_take: N > size"); - return tuple_elements(t, std::make_index_sequence{}); - } -}; - -template - struct TupleTake < Tuple, - N, std::enable_if_t> { - static auto call(Tuple t) { - constexpr size_t size = std::tuple_size(); - static_assert(-N <= size, "tuple_take: -N > size"); - return tuple_elements(t, make_offset_index_sequence{}); - } -}; - -template -auto tuple_take(Tuple t) { - return TupleTake::call(t); -} - -/** - * Use tuple_slice to extract a contiguous subtuple from the argument. - * - * Example: - * std::tuple t = std::make_tuple(0, - * "HEY", 2.0, false); std::tuple middle_two = - * tuple_slice(t); - */ -template -constexpr auto tuple_slice(Tuple t) { - constexpr size_t size = std::tuple_size(); - static_assert(Start + N <= size, "tuple_slice: Start + N > size"); - return tuple_elements(t, make_offset_index_sequence{}); -} - -/** - * Use tuple_map to run a mapping function over a tuple to get a new tuple. 
- * - * Example 1: - * auto result = tuple_map(std::tuple(3, 4, 5), [] - * (int32_t a) -> int16_t {return a+1;}); - * // result == std::tuple(4, 5, 6) - * - * Example 2: - * struct Mapper { - * std::string operator()(int32_t a) const { - * return std::to_string(a); - * } - * int64_t operator()(const std::string& a) const { - * return atoi(a.c_str()); - * } - * }; - * auto result = tuple_map(std::tuple(3, "4"), - * Mapper()); - * // result == std::tuple("3", 4) - * - * Example 3: - * struct A final { - * int32_t func() { - * return 5; - * } - * }; - * struct B final { - * std::string func() { - * return "5"; - * } - * }; - * auto result = tuple_map(std::make_tuple(A(), B()), [] (auto a) { return - * a.func(); }); - * // result == std::tuple(5, "5"); - */ -namespace detail { -template -auto tuple_map( - // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved) - std::tuple&& tuple, - const Mapper& mapper, - std::index_sequence /*unused*/) { - return std::tuple(std::get( - tuple))))...>(mapper(std::forward(std::get(tuple)))...); -} -} // namespace detail - -template -auto tuple_map(std::tuple&& tuple, const Mapper& mapper) { - return detail::tuple_map( - std::move(tuple), mapper, std::index_sequence_for()); -} - -} // namespace c10::guts +#include diff --git a/c10/util/TypeList.h b/c10/util/TypeList.h index 244e5bb141cd7..9f79099710d71 100644 --- a/c10/util/TypeList.h +++ b/c10/util/TypeList.h @@ -1,515 +1 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace c10::guts { - -template -struct false_t : std::false_type {}; -template