diff --git a/.ci/docker/almalinux/Dockerfile b/.ci/docker/almalinux/Dockerfile index ce7803cf9acd2..3bc3fd8badc6d 100644 --- a/.ci/docker/almalinux/Dockerfile +++ b/.ci/docker/almalinux/Dockerfile @@ -7,13 +7,13 @@ ENV LC_ALL en_US.UTF-8 ENV LANG en_US.UTF-8 ENV LANGUAGE en_US.UTF-8 -ARG DEVTOOLSET_VERSION=11 +ARG DEVTOOLSET_VERSION=13 RUN yum -y update RUN yum -y install epel-release # install glibc-langpack-en make sure en_US.UTF-8 locale is available RUN yum -y install glibc-langpack-en -RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-toolchain +RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-gcc gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran gcc-toolset-${DEVTOOLSET_VERSION}-gdb # Just add everything as a safe.directory for git since these will be used in multiple places with git RUN git config --global --add safe.directory '*' ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH @@ -41,6 +41,7 @@ RUN bash ./install_conda.sh && rm install_conda.sh # Install CUDA FROM base as cuda ARG CUDA_VERSION=12.6 +ARG DEVTOOLSET_VERSION=13 RUN rm -rf /usr/local/cuda-* ADD ./common/install_cuda.sh install_cuda.sh COPY ./common/install_nccl.sh install_nccl.sh @@ -50,7 +51,8 @@ ENV CUDA_HOME=/usr/local/cuda-${CUDA_VERSION} # Preserve CUDA_VERSION for the builds ENV CUDA_VERSION=${CUDA_VERSION} # Make things in our path by default -ENV PATH=/usr/local/cuda-${CUDA_VERSION}/bin:$PATH +ENV PATH=/usr/local/cuda-${CUDA_VERSION}/bin:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH + FROM cuda as cuda12.6 RUN bash ./install_cuda.sh 12.6 @@ -68,8 +70,22 @@ FROM cuda as cuda13.0 RUN bash ./install_cuda.sh 13.0 ENV DESIRED_CUDA=13.0 -FROM ${ROCM_IMAGE} as rocm +FROM ${ROCM_IMAGE} as rocm_base +ARG DEVTOOLSET_VERSION=13 +ENV LC_ALL en_US.UTF-8 +ENV LANG en_US.UTF-8 +ENV LANGUAGE en_US.UTF-8 +# Install devtoolset on ROCm base image +RUN yum -y update && \ + yum -y install epel-release && \ + yum -y install glibc-langpack-en && \ + yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-gcc gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran gcc-toolset-${DEVTOOLSET_VERSION}-gdb +RUN git config --global --add safe.directory '*' +ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH + +FROM rocm_base as rocm ARG PYTORCH_ROCM_ARCH +ARG DEVTOOLSET_VERSION=13 ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} ADD ./common/install_mkl.sh install_mkl.sh RUN bash ./install_mkl.sh && rm install_mkl.sh @@ -88,6 +104,7 @@ COPY --from=cuda13.0 /usr/local/cuda-13.0 /usr/local/cuda-13.0 # Final step FROM ${BASE_TARGET} as final +ARG DEVTOOLSET_VERSION=13 COPY --from=openssl /opt/openssl /opt/openssl COPY --from=patchelf /patchelf /usr/local/bin/patchelf COPY --from=conda /opt/conda /opt/conda diff --git a/.ci/docker/almalinux/build.sh b/.ci/docker/almalinux/build.sh index ad234ce1ffb93..468f9b06418f7 100755 --- a/.ci/docker/almalinux/build.sh +++ b/.ci/docker/almalinux/build.sh @@ -36,11 +36,7 @@ case ${DOCKER_TAG_PREFIX} in ;; rocm*) BASE_TARGET=rocm - PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" - # 
add gfx950, gfx115x conditionally starting in ROCm 7.0 - if [[ "$ROCM_VERSION" == *"7.0"* ]]; then - PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151" - fi + PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151" EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}" ;; *) @@ -63,7 +59,7 @@ docker build \ --target final \ --progress plain \ --build-arg "BASE_TARGET=${BASE_TARGET}" \ - --build-arg "DEVTOOLSET_VERSION=11" \ + --build-arg "DEVTOOLSET_VERSION=13" \ ${EXTRA_BUILD_ARGS} \ -t ${tmp_tag} \ $@ \ diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index d0500b89780ce..b7e61115e37d6 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -168,6 +168,18 @@ case "$tag" in VISION=yes TRITON=yes ;; + pytorch-linux-jammy-py3.11-clang12) + ANACONDA_PYTHON_VERSION=3.11 + CLANG_VERSION=12 + VISION=no + TRITON=no + ;; + pytorch-linux-jammy-py3.12-clang12) + ANACONDA_PYTHON_VERSION=3.12 + CLANG_VERSION=12 + VISION=no + TRITON=no + ;; pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-jammy-rocm-n-py3-benchmarks | pytorch-linux-noble-rocm-n-py3) if [[ $tag =~ "jammy" ]]; then ANACONDA_PYTHON_VERSION=3.10 @@ -176,7 +188,7 @@ case "$tag" in fi GCC_VERSION=11 VISION=yes - ROCM_VERSION=7.0 + ROCM_VERSION=7.1 NINJA_VERSION=1.9.0 TRITON=yes KATEX=yes @@ -195,9 +207,9 @@ case "$tag" in NINJA_VERSION=1.9.0 TRITON=yes ;; - pytorch-linux-jammy-xpu-n-py3 | pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks) + pytorch-linux-noble-xpu-n-py3 | pytorch-linux-noble-xpu-n-py3-inductor-benchmarks) ANACONDA_PYTHON_VERSION=3.10 - GCC_VERSION=11 + GCC_VERSION=13 VISION=yes XPU_VERSION=2025.2 NINJA_VERSION=1.9.0 @@ -248,6 +260,12 @@ case "$tag" in HALIDE=yes TRITON=yes ;; + pytorch-linux-jammy-cuda12.8-py3.12-pallas) + CUDA_VERSION=12.8.1 + ANACONDA_PYTHON_VERSION=3.12 + GCC_VERSION=11 + PALLAS=yes + ;; pytorch-linux-jammy-py3.12-triton-cpu) CUDA_VERSION=12.6 ANACONDA_PYTHON_VERSION=3.12 @@ -261,9 +279,9 @@ case "$tag" in PYTHON_VERSION=3.10 CUDA_VERSION=12.8.1 ;; - pytorch-linux-jammy-aarch64-py3.10-gcc11) + pytorch-linux-jammy-aarch64-py3.10-gcc13) ANACONDA_PYTHON_VERSION=3.10 - GCC_VERSION=11 + GCC_VERSION=13 ACL=yes VISION=yes OPENBLAS=yes @@ -271,9 +289,19 @@ case "$tag" in # from pytorch/llvm:9.0.1 is x86 specific SKIP_LLVM_SRC_BUILD_INSTALL=yes ;; - pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks) + pytorch-linux-jammy-aarch64-py3.10-clang21) ANACONDA_PYTHON_VERSION=3.10 - GCC_VERSION=11 + CLANG_VERSION=21 + ACL=yes + VISION=yes + OPENBLAS=yes + # snadampal: skipping llvm src build install because the current version + # from pytorch/llvm:9.0.1 is x86 specific + SKIP_LLVM_SRC_BUILD_INSTALL=yes + ;; + pytorch-linux-jammy-aarch64-py3.10-gcc13-inductor-benchmarks) + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=13 ACL=yes VISION=yes OPENBLAS=yes @@ -359,6 +387,7 @@ docker build \ --build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \ --build-arg "EXECUTORCH=${EXECUTORCH}" \ --build-arg "HALIDE=${HALIDE}" \ + --build-arg "PALLAS=${PALLAS}" \ --build-arg "XPU_VERSION=${XPU_VERSION}" \ --build-arg "UNINSTALL_DILL=${UNINSTALL_DILL}" \ --build-arg "ACL=${ACL:-}" \ diff --git a/.ci/docker/ci_commit_pins/jax.txt b/.ci/docker/ci_commit_pins/jax.txt new file mode 100644 index 0000000000000..a3df0a6959e15 --- /dev/null +++ b/.ci/docker/ci_commit_pins/jax.txt @@ -0,0 +1 @@ +0.8.0 diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt index 
d893bdd32ab34..8fcbc3de469f4 100644 --- a/.ci/docker/ci_commit_pins/triton.txt +++ b/.ci/docker/ci_commit_pins/triton.txt @@ -1 +1 @@ -ac80c4190aa0321f761a08af97e1e1eee41f01d9 +5df9c723de8c23508773b07fe16dd34e4c444541 diff --git a/.ci/docker/common/install_clang.sh b/.ci/docker/common/install_clang.sh index 1cb216edf1b38..93daeee919b3d 100755 --- a/.ci/docker/common/install_clang.sh +++ b/.ci/docker/common/install_clang.sh @@ -8,8 +8,8 @@ if [ -n "$CLANG_VERSION" ]; then # work around ubuntu apt-get conflicts sudo apt-get -y -f install wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add - - if [[ $CLANG_VERSION == 18 ]]; then - apt-add-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-18 main" + if [[ $CLANG_VERSION -ge 18 ]]; then + apt-add-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-${CLANG_VERSION} main" fi fi diff --git a/.ci/docker/common/install_gcc.sh b/.ci/docker/common/install_gcc.sh index 3b96bf6e0ed2f..df1c059bc3869 100644 --- a/.ci/docker/common/install_gcc.sh +++ b/.ci/docker/common/install_gcc.sh @@ -7,11 +7,11 @@ if [ -n "$GCC_VERSION" ]; then # Need the official toolchain repo to get alternate packages add-apt-repository ppa:ubuntu-toolchain-r/test apt-get update - apt-get install -y g++-$GCC_VERSION + apt-get install -y g++-$GCC_VERSION gfortran-$GCC_VERSION update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-"$GCC_VERSION" 50 update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-"$GCC_VERSION" 50 update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-"$GCC_VERSION" 50 - + update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-"$GCC_VERSION" 50 # Cleanup package manager apt-get autoclean && apt-get clean diff --git a/.ci/docker/common/install_jax.sh b/.ci/docker/common/install_jax.sh new file mode 100755 index 0000000000000..184aedf0f94fe --- /dev/null +++ b/.ci/docker/common/install_jax.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +set -ex + +source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" + +# Get the pinned JAX version (same for all CUDA versions) +JAX_VERSION=$(get_pinned_commit /ci_commit_pins/jax) + +function install_jax_12() { + echo "Installing JAX ${JAX_VERSION} with CUDA 12 support" + pip_install "jax[cuda12]==${JAX_VERSION}" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html + + # Verify installation + python -c "import jax" # check for errors + echo "JAX ${JAX_VERSION} installation completed successfully for CUDA 12" +} + +function install_jax_13() { + echo "Installing JAX ${JAX_VERSION} with CUDA 13 support" + pip_install "jax[cuda13]==${JAX_VERSION}" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html + + # Verify installation + python -c "import jax" # check for errors + echo "JAX ${JAX_VERSION} installation completed successfully for CUDA 13" +} + +# idiomatic parameter and option handling in sh +while test $# -gt 0 +do + case "$1" in + 12.4|12.6|12.6.*|12.8|12.8.*|12.9|12.9.*) install_jax_12; + ;; + 13.0|13.0.*) install_jax_13; + ;; + *) echo "bad argument $1"; exit 1 + ;; + esac + shift +done diff --git a/.ci/docker/common/install_libgomp.sh b/.ci/docker/common/install_libgomp.sh new file mode 100644 index 0000000000000..308915ec4f618 --- /dev/null +++ b/.ci/docker/common/install_libgomp.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# Script used only in CD pipeline + +set -ex + +# install dependencies +dnf -y install gmp-devel libmpc-devel texinfo flex bison + +cd /usr/local/src +# fetch source for gcc 13 +git 
clone --depth 1 --single-branch -b releases/gcc-13.3.0 https://github.com/gcc-mirror/gcc.git gcc-13.3.0 + +mkdir -p gcc-13.3.0/build-gomp +cd gcc-13.3.0/build-gomp + +# configure gcc build +# I got these flags by: +# 1. downloading the source rpm for gcc-11 on AlmaLinux 8 container +# dnf install -y dnf-plugins-core rpmdevtools +# dnf download --source libgomp +# 2. extracting the gcc.spec from the source. +# rpmdev-extract gcc-xx.src.rpm +# 3. extracting optflags and ld_flags from gcc.spec: +# rpm --eval '%{optflags}' +# rpm --eval '%{build_ldflags}' +# +# I had to remove the following flags because they didn't compile for this version of libgomp: +# -Werror=format-security +# -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 +# -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 +# +# I added -march=armv8-a -mtune=generic to make them explicit. I don't think they're strictly needed. + +OPT_FLAGS='-O2 -march=armv8-a -mtune=generic'\ +' -fexceptions -g -grecord-gcc-switches -pipe -Wall'\ +' -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS'\ +' -fstack-protector-strong -fasynchronous-unwind-tables'\ +' -fstack-clash-protection' + +LDFLAGS='-Wl,-z,relro -Wl,--as-needed -Wl,-z,now' + +CFLAGS="$OPT_FLAGS" \ +CXXFLAGS="$OPT_FLAGS" \ +LDFLAGS="$LDFLAGS" \ +../configure \ + --prefix=/usr \ + --libdir=/usr/lib64 \ + --enable-languages=c,c++ \ + --disable-multilib \ + --disable-bootstrap \ + --enable-libgomp + +# only build libgomp +make -j$(nproc) all-target-libgomp + +make install-target-libgomp \ No newline at end of file diff --git a/.ci/docker/common/install_openblas.sh b/.ci/docker/common/install_openblas.sh index 2f386c6bd523a..5a28068781245 100755 --- a/.ci/docker/common/install_openblas.sh +++ b/.ci/docker/common/install_openblas.sh @@ -10,6 +10,7 @@ git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION}" - OPENBLAS_CHECKOUT_DIR="OpenBLAS" OPENBLAS_BUILD_FLAGS=" +CC=gcc NUM_THREADS=128 USE_OPENMP=1 NO_SHARED=0 diff --git a/.ci/docker/common/install_rocm.sh b/.ci/docker/common/install_rocm.sh index 9376d259d9cca..988347e28e9d8 100644 --- a/.ci/docker/common/install_rocm.sh +++ b/.ci/docker/common/install_rocm.sh @@ -60,14 +60,16 @@ EOF DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev fi - # precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5 - # search for all unversioned packages - # if search fails it will abort this script; use true to avoid case where search fails - MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true) - if [[ "x${MIOPENHIPGFX}" = x ]]; then - echo "miopen-hip-gfx package not available" && exit 1 - else - DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX} + if [[ $(ver $ROCM_VERSION) -lt $(ver 7.1) ]]; then + # precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5, removed in ROCm 7.1 + # search for all unversioned packages + # if search fails it will abort this script; use true to avoid case where search fails + MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . 
|| true) + if [[ "x${MIOPENHIPGFX}" = x ]]; then + echo "miopen-hip-gfx package not available" && exit 1 + else + DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX} + fi fi # ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime diff --git a/.ci/docker/common/install_rocm_magma.sh b/.ci/docker/common/install_rocm_magma.sh index 2d03c6186b8e5..9bf45e6f1b0a9 100644 --- a/.ci/docker/common/install_rocm_magma.sh +++ b/.ci/docker/common/install_rocm_magma.sh @@ -12,8 +12,8 @@ function do_install() { rocm_version_nodot=${rocm_version//./} - # post merge of https://github.com/icl-utk-edu/magma/pull/65 - MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f + # https://github.com/icl-utk-edu/magma/pull/65 + MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2" rocm_dir="/opt/rocm" diff --git a/.ci/docker/common/install_xpu.sh b/.ci/docker/common/install_xpu.sh index 0b150872f93ce..22b7af890c1f6 100644 --- a/.ci/docker/common/install_xpu.sh +++ b/.ci/docker/common/install_xpu.sh @@ -9,7 +9,7 @@ set -xe function install_ubuntu() { . /etc/os-release - if [[ ! " jammy " =~ " ${VERSION_CODENAME} " ]]; then + if [[ ! " jammy noble " =~ " ${VERSION_CODENAME} " ]]; then echo "Ubuntu version ${VERSION_CODENAME} not supported" exit fi @@ -35,25 +35,24 @@ function install_ubuntu() { # The xpu-smi packages apt-get install -y flex bison xpu-smi - if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then - # Compute and Media Runtimes + # Compute and Media Runtimes + if [[ " ${VERSION_CODENAME} " =~ " noble " ]]; then apt-get install -y \ - intel-opencl-icd intel-level-zero-gpu level-zero \ - intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \ - libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \ + intel-opencl-icd libze-intel-gpu1 libze1 \ + intel-media-va-driver-non-free libmfx-gen1 libvpl2 \ + libegl-mesa0 libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \ libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \ - mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo - # Development Packages - apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev - else # rolling driver + mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc + else # jammy apt-get install -y \ intel-opencl-icd libze-intel-gpu1 libze1 \ intel-media-va-driver-non-free libmfx-gen1 libvpl2 \ libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \ libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \ mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc - apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev fi + # Development Packages + apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev # Install Intel Support Packages apt-get install -y ${XPU_PACKAGES} @@ -66,7 +65,7 @@ function install_ubuntu() { function install_rhel() { . /etc/os-release if [[ "${ID}" == "rhel" ]]; then - if [[ ! " 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then + if [[ ! 
" 8.8 8.10 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then echo "RHEL version ${VERSION_ID} not supported" exit fi @@ -147,7 +146,7 @@ function install_sles() { XPU_DRIVER_VERSION="" if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then # Use GPU driver LTS releases - XPU_DRIVER_VERSION="/lts/2350" + XPU_DRIVER_VERSION="/lts/2523" fi # Default use Intel® oneAPI Deep Learning Essentials 2025.1 diff --git a/.ci/docker/libtorch/build.sh b/.ci/docker/libtorch/build.sh index c40896cb5499f..76d3e01e1c38f 100755 --- a/.ci/docker/libtorch/build.sh +++ b/.ci/docker/libtorch/build.sh @@ -49,11 +49,7 @@ case ${DOCKER_TAG_PREFIX} in fi BASE_TARGET=rocm GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete - PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" - # add gfx950, gfx115x conditionally starting in ROCm 7.0 - if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then - PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151" - fi + PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151" DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}" ;; *) diff --git a/.ci/docker/manywheel/Dockerfile_2_28_aarch64 b/.ci/docker/manywheel/Dockerfile_2_28_aarch64 index 768db09929361..78ee09d128cb0 100644 --- a/.ci/docker/manywheel/Dockerfile_2_28_aarch64 +++ b/.ci/docker/manywheel/Dockerfile_2_28_aarch64 @@ -50,6 +50,10 @@ RUN rm install_ninja.sh ENV PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/bin:$PATH ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH +# Build a newer version of libgomp than that supported in in Almalinux 8. 
+COPY ./common/install_libgomp.sh install_libgomp.sh +RUN bash ./install_libgomp.sh && rm install_libgomp.sh + # git236+ would refuse to run git commands in repos owned by other users # Which causes version check to fail, as pytorch repo is bind-mounted into the image # Override this behaviour by treating every folder as safe diff --git a/.ci/docker/manywheel/build.sh b/.ci/docker/manywheel/build.sh index ac385ce4b29fd..8f9059dc0cc12 100755 --- a/.ci/docker/manywheel/build.sh +++ b/.ci/docker/manywheel/build.sh @@ -87,11 +87,7 @@ case ${image} in MANY_LINUX_VERSION="2_28" DEVTOOLSET_VERSION="11" GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete - PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" - # add gfx950, gfx115x conditionally starting in ROCm 7.0 - if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then - PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151" - fi + PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151" DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}" ;; manylinux2_28-builder:xpu) diff --git a/.ci/docker/requirements-docs.txt b/.ci/docker/requirements-docs.txt index 6e623b4c56949..de71919012e13 100644 --- a/.ci/docker/requirements-docs.txt +++ b/.ci/docker/requirements-docs.txt @@ -1,15 +1,11 @@ -sphinx==5.3.0 +sphinx==7.2.6 #Description: This is used to generate PyTorch docs -#Pinned versions: 5.3.0 +#Pinned versions: 7.2.6 -standard-imghdr==3.13.0; python_version >= "3.13" -#Description: This is needed by Sphinx, so it needs to be added here. -# The reasons are as follows: -# 1) This module has been removed from the Python standard library since Python 3.13(https://peps.python.org/pep-0594/#imghdr); -# 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13. -# Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency. +pytorch_sphinx_theme2==0.2.0 +#Description: This is needed to generate PyTorch docs +#Pinned versions: 0.2.0 --e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering # but it doesn't seem to work and hangs around idly. The initial thought that it is probably # something related to Docker setup. We can investigate this later. @@ -36,17 +32,17 @@ tensorboard==2.18.0 ; python_version >= "3.13" #Description: This is used to generate PyTorch docs #Pinned versions: 2.13.0 -breathe==4.34.0 +breathe==4.36.0 #Description: This is used to generate PyTorch C++ docs -#Pinned versions: 4.34.0 +#Pinned versions: 4.36.0 -exhale==0.2.3 +exhale==0.3.7 #Description: This is used to generate PyTorch C++ docs -#Pinned versions: 0.2.3 +#Pinned versions: 0.3.7 -docutils==0.16 +docutils==0.20 #Description: This is used to generate PyTorch C++ docs -#Pinned versions: 0.16 +#Pinned versions: 0.20 bs4==0.0.1 #Description: This is used to generate PyTorch C++ docs @@ -56,13 +52,13 @@ IPython==8.12.0 #Description: This is used to generate PyTorch functorch docs #Pinned versions: 8.12.0 -myst-nb==0.17.2 +myst-nb==1.3.0 #Description: This is used to generate PyTorch functorch and torch.compile docs. 
-#Pinned versions: 0.17.2 +#Pinned versions: 1.3.0 # The following are required to build torch.distributed.elastic.rendezvous.etcd* docs python-etcd==0.4.5 sphinx-copybutton==0.5.0 -sphinx-design==0.4.0 +sphinx-design==0.6.1 sphinxcontrib-mermaid==1.0.0 -myst-parser==0.18.1 +myst-parser==4.0.1 diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 84a74114c381e..2081dcbdffd17 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -143,6 +143,15 @@ COPY ci_commit_pins/halide.txt halide.txt RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi RUN rm install_halide.sh common_utils.sh halide.txt +ARG PALLAS +ARG CUDA_VERSION +# Install JAX with CUDA support (for Pallas) +COPY ./common/install_jax.sh install_jax.sh +COPY ./common/common_utils.sh common_utils.sh +COPY ./ci_commit_pins/jax.txt /ci_commit_pins/jax.txt +RUN if [ -n "${PALLAS}" ]; then bash ./install_jax.sh ${CUDA_VERSION}; fi +RUN rm -f install_jax.sh common_utils.sh /ci_commit_pins/jax.txt + ARG ONNX # Install ONNX dependencies COPY ./common/install_onnx.sh ./common/common_utils.sh ./ diff --git a/.ci/lumen_cli/cli/lib/common/cli_helper.py b/.ci/lumen_cli/cli/lib/common/cli_helper.py index 927ca09fe7230..4086eb7d46e81 100644 --- a/.ci/lumen_cli/cli/lib/common/cli_helper.py +++ b/.ci/lumen_cli/cli/lib/common/cli_helper.py @@ -8,9 +8,11 @@ try: - from typing import Any, Callable, Required, TypedDict # Python 3.11+ + from collections.abc import Callable # Python 3.11+ + from typing import Any, Required, TypedDict except ImportError: - from typing import Any, Callable, TypedDict + from collections.abc import Callable + from typing import Any, TypedDict from typing_extensions import Required # Fallback for Python <3.11 diff --git a/.ci/magma-rocm/README.md b/.ci/magma-rocm/README.md index cfc3cd3ab1632..3fe1e5d976ccd 100644 --- a/.ci/magma-rocm/README.md +++ b/.ci/magma-rocm/README.md @@ -30,7 +30,6 @@ into a tarball, with the following structure: More specifically, `build_magma.sh` copies over the relevant files from the `package_files` directory depending on the ROCm version. Outputted binaries should be in the `output` folder. - ## Pushing Packages can be uploaded to an S3 bucket using: diff --git a/.ci/magma-rocm/build_magma.sh b/.ci/magma-rocm/build_magma.sh index 7d95fed873dc0..c7c7780227ea5 100755 --- a/.ci/magma-rocm/build_magma.sh +++ b/.ci/magma-rocm/build_magma.sh @@ -6,8 +6,8 @@ set -eou pipefail # The script expects DESIRED_CUDA and PACKAGE_NAME to be set ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" -# post merge of https://github.com/icl-utk-edu/magma/pull/65 -MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f +# https://github.com/icl-utk-edu/magma/pull/65 +MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec # Folders for the build PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata @@ -20,7 +20,7 @@ mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RE # Fetch magma sources and verify checksum pushd ${PACKAGE_DIR} -git clone https://github.com/icl-utk-edu/magma +git clone https://github.com/jeffdaily/magma pushd magma git checkout ${MAGMA_VERSION} popd diff --git a/.ci/onnx/common.sh b/.ci/onnx/common.sh index 3de5836a02858..b8f912fbbb4e6 100644 --- a/.ci/onnx/common.sh +++ b/.ci/onnx/common.sh @@ -21,3 +21,87 @@ if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then fi mkdir -p "$pytest_reports_dir" || true + +########################################## +# copied from .ci/pytorch/common_utils.sh +########################################## + +function get_pinned_commit() { + cat .github/ci_commit_pins/"${1}".txt +} + +function pip_install_whl() { + # This is used to install PyTorch and other build artifacts wheel locally + # without using any network connection + + # Convert the input arguments into an array + local args=("$@") + + # Check if the first argument contains multiple paths separated by spaces + if [[ "${args[0]}" == *" "* ]]; then + # Split the string by spaces into an array + IFS=' ' read -r -a paths <<< "${args[0]}" + # Loop through each path and install individually + for path in "${paths[@]}"; do + echo "Installing $path" + python3 -mpip install --no-index --no-deps "$path" + done + else + # Loop through each argument and install individually + for path in "${args[@]}"; do + echo "Installing $path" + python3 -mpip install --no-index --no-deps "$path" + done + fi +} + +function pip_build_and_install() { + local build_target=$1 + local wheel_dir=$2 + + local found_whl=0 + for file in "${wheel_dir}"/*.whl + do + if [[ -f "${file}" ]]; then + found_whl=1 + break + fi + done + + # Build the wheel if it doesn't exist + if [ "${found_whl}" == "0" ]; then + python3 -m pip wheel \ + --no-build-isolation \ + --no-deps \ + -w "${wheel_dir}" \ + "${build_target}" + fi + + for file in "${wheel_dir}"/*.whl + do + pip_install_whl "${file}" + done +} + +function install_torchvision() { + local orig_preload + local commit + commit=$(get_pinned_commit vision) + orig_preload=${LD_PRELOAD} + if [ -n "${LD_PRELOAD}" ]; then + # Silence dlerror to work-around glibc ASAN bug, see https://sourceware.org/bugzilla/show_bug.cgi?id=27653#c9 + echo 'char* dlerror(void) { return "";}'|gcc -fpic -shared -o "${HOME}/dlerror.so" -x c - + LD_PRELOAD=${orig_preload}:${HOME}/dlerror.so + fi + + if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then + # Not sure if both are needed, but why not + export FORCE_CUDA=1 + export WITH_CUDA=1 + fi + pip_build_and_install "git+https://github.com/pytorch/vision.git@${commit}" dist/vision + + if [ -n "${LD_PRELOAD}" ]; then + LD_PRELOAD=${orig_preload} + fi +} diff --git a/.ci/onnx/test.sh b/.ci/onnx/test.sh index d42ca2c218dec..1f2a23b49dc45 100755 --- a/.ci/onnx/test.sh +++ b/.ci/onnx/test.sh @@ -19,7 +19,7 @@ git config --global --add safe.directory /var/lib/jenkins/workspace if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then # TODO: This can be removed later once vision is also part of the Docker image - pip install -q --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)" + 
install_torchvision # JIT C++ extensions require ninja, so put it into PATH. export PATH="/var/lib/jenkins/.local/bin:$PATH" # NB: ONNX test is fast (~15m) so it's ok to retry it few more times to avoid any flaky issue, we diff --git a/.ci/pytorch/build.sh b/.ci/pytorch/build.sh index d66aa1120fb30..071f14700def4 100755 --- a/.ci/pytorch/build.sh +++ b/.ci/pytorch/build.sh @@ -168,14 +168,16 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then # shellcheck disable=SC1091 source /opt/intel/oneapi/compiler/latest/env/vars.sh # shellcheck disable=SC1091 + source /opt/intel/oneapi/umf/latest/env/vars.sh + # shellcheck disable=SC1091 source /opt/intel/oneapi/ccl/latest/env/vars.sh # shellcheck disable=SC1091 source /opt/intel/oneapi/mpi/latest/env/vars.sh + # shellcheck disable=SC1091 + source /opt/intel/oneapi/pti/latest/env/vars.sh # Enable XCCL build export USE_XCCL=1 export USE_MPI=0 - # XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA - export USE_KINETO=0 export TORCH_XPU_ARCH_LIST=pvc fi diff --git a/.ci/pytorch/common_utils.sh b/.ci/pytorch/common_utils.sh index 9c9d223777466..323ac6cacd889 100644 --- a/.ci/pytorch/common_utils.sh +++ b/.ci/pytorch/common_utils.sh @@ -96,7 +96,6 @@ function pip_build_and_install() { python3 -m pip wheel \ --no-build-isolation \ --no-deps \ - --no-use-pep517 \ -w "${wheel_dir}" \ "${build_target}" fi @@ -308,6 +307,28 @@ function install_torchao() { pip_build_and_install "git+https://github.com/pytorch/ao.git@${commit}" dist/ao } +function install_flash_attn_cute() { + echo "Installing FlashAttention CuTe from GitHub..." + # Grab latest main til we have a pinned commit + local flash_attn_commit + flash_attn_commit=$(git ls-remote https://github.com/Dao-AILab/flash-attention.git HEAD | cut -f1) + + # Clone the repo to a temporary directory + rm -rf flash-attention-build + git clone --depth 1 --recursive https://github.com/Dao-AILab/flash-attention.git flash-attention-build + + pushd flash-attention-build + git checkout "${flash_attn_commit}" + + # Install only the 'cute' sub-directory + pip_install -e flash_attn/cute/ + popd + + # remove the local repo + rm -rf flash-attention-build + echo "FlashAttention CuTe installation complete." +} + function print_sccache_stats() { echo 'PyTorch Build Statistics' sccache --show-stats diff --git a/.ci/pytorch/python_doc_push_script.sh b/.ci/pytorch/python_doc_push_script.sh index ec1187b3fe4c4..6bcd46c4815a6 100755 --- a/.ci/pytorch/python_doc_push_script.sh +++ b/.ci/pytorch/python_doc_push_script.sh @@ -89,23 +89,41 @@ if [ "$is_main_doc" = true ]; then make coverage # Now we have the coverage report, we need to make sure it is empty. - # Count the number of lines in the file and turn that number into a variable - # $lines. The `cut -f1 ...` is to only parse the number, not the filename - # Skip the report header by subtracting 2: the header will be output even if - # there are no undocumented items. + # Sphinx 7.2.6+ format: python.txt contains a statistics table with a TOTAL row + # showing the undocumented count in the third column. + # Example: | TOTAL | 99.83% | 2 | # # Also: see docs/source/conf.py for "coverage_ignore*" items, which should # be documented then removed from there. 
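As a side note (not part of the patch): the comments above describe the Sphinx 7.2.6 coverage statistics table that the new parsing code below consumes. A tiny self-contained reproduction of that parsing, on an invented sample `python.txt`, looks like this; the file contents are made up purely to illustrate the format the script expects.

```bash
# Not part of the patch: toy reproduction of the TOTAL-row parsing on an invented file.
cat > /tmp/python.txt <<'EOF'
| Module    | Coverage | Undocumented |
|-----------|----------|--------------|
| torch.foo | 99.50%   | 1            |
| TOTAL     | 99.83%   | 2            |

torch.foo.bar  -- undocumented helper (invented example)
EOF
undocumented=$(grep "| TOTAL" /tmp/python.txt | awk -F'|' '{print $4}' | tr -d ' ')
echo "undocumented objects: ${undocumented}"      # -> 2
total_line=$(grep -n "| TOTAL" /tmp/python.txt | cut -d: -f1)
tail -n +$((total_line + 2)) /tmp/python.txt      # prints only the detailed list after the table
```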
- lines=$(wc -l build/coverage/python.txt 2>/dev/null |cut -f1 -d' ') - undocumented=$((lines - 2)) - if [ $undocumented -lt 0 ]; then + + # Extract undocumented count from TOTAL row in Sphinx 7.2.6 statistics table + # The table format is: | Module | Coverage | Undocumented | + # Extract the third column (undocumented count) from the TOTAL row + undocumented=$(grep "| TOTAL" build/coverage/python.txt | awk -F'|' '{print $4}' | tr -d ' ') + + if [ -z "$undocumented" ] || ! [[ "$undocumented" =~ ^[0-9]+$ ]]; then echo coverage output not found exit 1 - elif [ $undocumented -gt 0 ]; then - echo undocumented objects found: - cat build/coverage/python.txt + elif [ "$undocumented" -gt 0 ]; then + set +x # Disable command echoing for cleaner output + echo "" + echo "=====================" + echo "UNDOCUMENTED OBJECTS:" + echo "=====================" + echo "" + # Find the line number of the TOTAL row and print only what comes after it + total_line=$(grep -n "| TOTAL" build/coverage/python.txt | cut -d: -f1) + if [ -n "$total_line" ]; then + # Print only the detailed list (skip the statistics table) + tail -n +$((total_line + 2)) build/coverage/python.txt + else + # Fallback to showing entire file if TOTAL line not found + cat build/coverage/python.txt + fi + echo "" echo "Make sure you've updated relevant .rsts in docs/source!" - echo "You can reproduce locally by running 'cd docs && make coverage && cat build/coverage/python.txt'" + echo "You can reproduce locally by running 'cd docs && make coverage && tail -n +\$((\$(grep -n \"| TOTAL\" build/coverage/python.txt | cut -d: -f1) + 2)) build/coverage/python.txt'" + set -x # Re-enable command echoing exit 1 fi else diff --git a/.ci/pytorch/smoke_test/smoke_test.py b/.ci/pytorch/smoke_test/smoke_test.py index 675d58a3e283d..e760340bebb12 100644 --- a/.ci/pytorch/smoke_test/smoke_test.py +++ b/.ci/pytorch/smoke_test/smoke_test.py @@ -353,6 +353,17 @@ def test_linalg(device="cpu") -> None: torch.linalg.svd(A) +def test_sdpa(device="cpu", dtype=torch.float16) -> None: + """Regression test for https://github.com/pytorch/pytorch/issues/167602 + Without nvrtc_builtins on CuDNN-9.13 on CUDA-13 fails with ` No valid execution plans built.` + """ + print(f"Testing SDPA on {device} using type {dtype}") + k, q, v = torch.rand(3, 1, 16, 77, 64, dtype=dtype, device=device).unbind(0) + attn = torch.rand(1, 1, 77, 77, dtype=dtype, device=device) + rc = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn) + assert rc.isnan().any().item() is False + + def smoke_test_compile(device: str = "cpu") -> None: supported_dtypes = [torch.float16, torch.float32, torch.float64] @@ -489,10 +500,12 @@ def main() -> None: smoke_test_conv2d() test_linalg() test_numpy() + test_sdpa() if is_cuda_system: test_linalg("cuda") test_cuda_gds_errors_captured() + test_sdpa("cuda") if options.package == "all": smoke_test_modules() diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index 9ae2578758939..01075259e9fe9 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh @@ -208,6 +208,8 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then source /opt/intel/oneapi/ccl/latest/env/vars.sh # shellcheck disable=SC1091 source /opt/intel/oneapi/mpi/latest/env/vars.sh + # shellcheck disable=SC1091 + source /opt/intel/oneapi/pti/latest/env/vars.sh # Check XPU status before testing timeout 30 xpu-smi discovery || true fi @@ -342,8 +344,18 @@ test_python_smoke() { } test_python_smoke_b200() { - # Targeted smoke tests for B200 - staged approach to avoid too many failures - time python
test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running + # Targeted smoke tests for B200 including FlashAttention CuTe coverage + install_flash_attn_cute + time python test/run_test.py \ + --include \ + test_matmul_cuda \ + test_scaled_matmul_cuda \ + inductor/test_fp8 \ + nn/attention/test_fa4 \ + nn/attention/test_open_registry \ + inductor/test_flex_flash \ + $PYTHON_TEST_EXTRA_OPTION \ + --upload-artifacts-while-running assert_git_not_dirty } @@ -377,6 +389,13 @@ test_lazy_tensor_meta_reference_disabled() { export -n TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE } +test_dynamo_core() { + time python test/run_test.py \ + --include-dynamo-core-tests \ + --verbose \ + --upload-artifacts-while-running + assert_git_not_dirty +} test_dynamo_wrapped_shard() { if [[ -z "$NUM_TEST_SHARDS" ]]; then @@ -824,6 +843,11 @@ test_inductor_halide() { assert_git_not_dirty } +test_inductor_pallas() { + python test/run_test.py --include inductor/test_pallas.py --verbose + assert_git_not_dirty +} + test_inductor_triton_cpu() { python test/run_test.py --include inductor/test_triton_cpu_backend.py inductor/test_torchinductor_strided_blocks.py --verbose assert_git_not_dirty @@ -1226,6 +1250,97 @@ test_custom_script_ops() { assert_git_not_dirty } +test_libtorch_agnostic_targetting() { + echo "Testing libtorch_agnostic runs correctly on TORCH_TARGET_VERSION" + + REPO_DIR=$(pwd) + WHEEL_DIR="${REPO_DIR}/test/cpp_extensions/.wheels" + + # Build wheel with current PyTorch (this has TORCH_TARGET_VERSION 2_9_0) + echo "Building 2.9 extension wheel with current PyTorch..." + pushd test/cpp_extensions/libtorch_agnostic_2_9_extension + time python setup.py bdist_wheel + + # Save the wheel + mkdir -p "$WHEEL_DIR" + cp dist/*.whl "$WHEEL_DIR/" + WHEEL_FILE=$(find "$WHEEL_DIR" -maxdepth 1 -name "*.whl" -type f | head -1) + echo "Built wheel: $(basename "$WHEEL_FILE")" + popd + + # Create venv and install PyTorch 2.9 + python -m venv venv_pytorch_2_9 + # shellcheck disable=SC1091 + . venv_pytorch_2_9/bin/activate + + # Clear PYTHONPATH to avoid using the development PyTorch + echo "Clearing PYTHONPATH to use only venv packages..." + unset PYTHONPATH + + # Upgrade pip to latest version + echo "Upgrading pip to latest version..." + pip install --upgrade pip + pip --version + + echo "Installing PyTorch 2.9..." 
+ + # Install from release channel only + PYTORCH_VERSION="2.9.0" + + # Extract CUDA version from BUILD_ENVIRONMENT (e.g., "cuda12.1" -> "cu121") + if [[ "$BUILD_ENVIRONMENT" =~ cuda([0-9]+)\.([0-9]+) ]]; then + CUDA_MAJOR="${BASH_REMATCH[1]}" + CUDA_MINOR="${BASH_REMATCH[2]}" + CUDA_VERSION="cu${CUDA_MAJOR}${CUDA_MINOR}" + echo " Detected CUDA ${CUDA_MAJOR}.${CUDA_MINOR} from BUILD_ENVIRONMENT, using ${CUDA_VERSION}" + else + # Default to CPU build + CUDA_VERSION="cpu" + echo " No CUDA detected in BUILD_ENVIRONMENT, using CPU build" + fi + + if pip install torch=="${PYTORCH_VERSION}" --index-url https://download.pytorch.org/whl/${CUDA_VERSION}/; then + echo "Installed PyTorch ${PYTORCH_VERSION} from release channel (${CUDA_VERSION})" + else + echo " FAILED to install PyTorch 2.9.0 from release channel" + echo " URL: https://download.pytorch.org/whl/${CUDA_VERSION}/" + deactivate + rm -rf venv_pytorch_2_9 + return 1 + fi + + INSTALLED_VERSION=$(python -c "import torch; print(torch.__version__)" 2>/dev/null || echo "unknown") + echo " Installed version: $INSTALLED_VERSION" + + # Install test dependencies + echo "Installing test dependencies..." + pip install expecttest numpy unittest-xml-reporting + + # Install the pre-built wheel + echo "" + echo "Installing pre-built 2.9 extension wheel (built with PyTorch 2.10)..." + pip install "$WHEEL_FILE" + echo "Installed $(basename "$WHEEL_FILE") into PyTorch 2.9 environment" + + # Run tests with PyTorch 2.9 runtime (2.10 tests will be skipped automatically) + echo "" + echo "Running tests with PyTorch 2.9 runtime (using wheel built on PyTorch 2.10)..." + if time python test/cpp_extensions/test_libtorch_agnostic.py -v; then + echo "" + echo " Wheel built with current torch and TORCH_TARGET_VERSION 2_9_0 works with PyTorch 2.9 runtime!" + else + echo "targeting test failed" + deactivate + rm -rf venv_pytorch_2_9 "$WHEEL_DIR" + return 1 + fi + + deactivate + rm -rf venv_pytorch_2_9 "$WHEEL_DIR" + + assert_git_not_dirty +} + test_jit_hooks() { echo "Testing jit hooks in cpp" HOOK_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/jit-hook-build" @@ -1663,6 +1778,22 @@ test_operator_microbenchmark() { done } +test_attention_microbenchmark() { + TEST_REPORTS_DIR=$(pwd)/test/test-reports + mkdir -p "$TEST_REPORTS_DIR" + TEST_DIR=$(pwd) + + # Install attention-gym dependency + echo "Installing attention-gym..." + python -m pip install git+https://github.com/meta-pytorch/attention-gym.git@main + pip show triton + + cd "${TEST_DIR}"/benchmarks/transformer + + $TASKSET python score_mod.py --config configs/config_basic.yaml \ + --output-json-for-dashboard "${TEST_REPORTS_DIR}/attention_microbenchmark.json" +} + if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then (cd test && python -c "import torch; print(torch.__config__.show())") (cd test && python -c "import torch; print(torch.__config__.parallel_info())") @@ -1682,6 +1813,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" == 'default' ]]; elif [[ "${TEST_CONFIG}" == *backward* ]]; then test_forward_backward_compatibility # Do NOT add tests after bc check tests, see its comment. 
+elif [[ "${TEST_CONFIG}" == *libtorch_agnostic_targetting* ]]; then + test_libtorch_agnostic_targetting elif [[ "${TEST_CONFIG}" == *xla* ]]; then install_torchvision build_xla @@ -1720,10 +1853,14 @@ elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then fi elif [[ "${TEST_CONFIG}" == *operator_microbenchmark* ]]; then test_operator_microbenchmark +elif [[ "${TEST_CONFIG}" == *attention_microbenchmark* ]]; then + test_attention_microbenchmark elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then test_inductor_distributed elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then test_inductor_halide +elif [[ "${TEST_CONFIG}" == *inductor-pallas* ]]; then + test_inductor_pallas elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then test_inductor_triton_cpu elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then @@ -1777,6 +1914,8 @@ elif [[ "${TEST_CONFIG}" == *inductor* ]]; then test_inductor_shard "${SHARD_NUMBER}" elif [[ "${TEST_CONFIG}" == *einops* ]]; then test_einops +elif [[ "${TEST_CONFIG}" == *dynamo_core* ]]; then + test_dynamo_core elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then install_torchvision test_dynamo_wrapped_shard "${SHARD_NUMBER}" diff --git a/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1 b/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1 index a165f2a222caf..f0eabed170d25 100644 --- a/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1 +++ b/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1 @@ -70,7 +70,7 @@ sccache --zero-stats sccache --show-stats # Build the wheel -python -m build --wheel --no-build-isolation +python -m build --wheel --no-isolation if ($LASTEXITCODE -ne 0) { exit 1 } # Install the wheel locally diff --git a/.github/ISSUE_TEMPLATE/release-feature-request.yml b/.github/ISSUE_TEMPLATE/release-feature-request.yml index 80f10807ae56b..090a41d1942f6 100644 --- a/.github/ISSUE_TEMPLATE/release-feature-request.yml +++ b/.github/ISSUE_TEMPLATE/release-feature-request.yml @@ -1,11 +1,11 @@ -name: 🚀 Release highlight for proposed Feature +name: 🚀 New Feature for Release description: Submit a Release highlight for proposed Feature labels: ["release-feature-request"] body: - type: textarea attributes: - label: Release highlight for proposed Feature + label: New Feature for Release description: > Example: “A torch.special module, analogous to SciPy's special module.” - type: input diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index d021371ca8863..dfb30e155b162 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -63,7 +63,7 @@ self-hosted-runner: - linux.rocm.gpu.gfx942.1 - linux.rocm.gpu.gfx942.2 - linux.rocm.gpu.gfx942.4 - - rocm-docker + - linux.rocm.gfx942.docker-cache # Org wise AWS `mac2.metal` runners (2020 Mac mini hardware powered by Apple silicon M1 processors) - macos-m1-stable - macos-m1-14 diff --git a/.github/actions/pytest-cache-download/action.yml b/.github/actions/pytest-cache-download/action.yml index 1406f962c4ca8..3f51f6a5525bc 100644 --- a/.github/actions/pytest-cache-download/action.yml +++ b/.github/actions/pytest-cache-download/action.yml @@ -38,9 +38,9 @@ runs: run: | python3 .github/scripts/pytest_cache.py \ --download \ - --cache_dir $GITHUB_WORKSPACE/$CACHE_DIR \ - --pr_identifier $GITHUB_REF \ - --job_identifier $JOB_IDENTIFIER \ - --temp_dir $RUNNER_TEMP \ - --repo $REPO \ - --bucket $BUCKET \ + --cache_dir "$GITHUB_WORKSPACE/$CACHE_DIR" \ + --pr_identifier "$GITHUB_REF" \ + --job_identifier "$JOB_IDENTIFIER" \ + --temp_dir "$RUNNER_TEMP" \ + --repo "$REPO" \ 
+ --bucket "$BUCKET" \ diff --git a/.github/actions/pytest-cache-upload/action.yml b/.github/actions/pytest-cache-upload/action.yml index 2652d019075f7..9fbb63a760f27 100644 --- a/.github/actions/pytest-cache-upload/action.yml +++ b/.github/actions/pytest-cache-upload/action.yml @@ -47,11 +47,11 @@ runs: run: | python3 .github/scripts/pytest_cache.py \ --upload \ - --cache_dir $GITHUB_WORKSPACE/$CACHE_DIR \ - --pr_identifier $GITHUB_REF \ - --job_identifier $JOB_IDENTIFIER \ - --sha $SHA \ - --test_config $TEST_CONFIG \ - --shard $SHARD \ - --repo $REPO \ - --temp_dir $RUNNER_TEMP \ + --cache_dir "$GITHUB_WORKSPACE/$CACHE_DIR" \ + --pr_identifier "$GITHUB_REF" \ + --job_identifier "$JOB_IDENTIFIER" \ + --sha "$SHA" \ + --test_config "$TEST_CONFIG" \ + --shard "$SHARD" \ + --repo "$REPO" \ + --temp_dir "$RUNNER_TEMP" \ diff --git a/.github/ci_commit_pins/audio.txt b/.github/ci_commit_pins/audio.txt index 966f6bcfc0d94..616dfd88ce812 100644 --- a/.github/ci_commit_pins/audio.txt +++ b/.github/ci_commit_pins/audio.txt @@ -1 +1 @@ -3b0e7a6f192ca2715e7e6cbe5db007aea7165fe2 +ee1a1350eb37804b94334768f328144f058f14e9 diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt index 183e9fb4b06e1..64ee992f566b7 100644 --- a/.github/ci_commit_pins/vision.txt +++ b/.github/ci_commit_pins/vision.txt @@ -1 +1 @@ -cfbc5c2f1c798991715a6b06bb3ce46478c4487c +2d82dc5caa336d179d9b46ac4a0fb8c43d84c5cc diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt index 01f0673fcf802..803ba72d9ac92 100644 --- a/.github/ci_commit_pins/xla.txt +++ b/.github/ci_commit_pins/xla.txt @@ -1 +1 @@ -c8b09f5f77d6bf6fb7ed7a9aa83e5d8156b3a5e9 +94631807d22c09723dd006f7be5beb649d5f88d0 diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 0000000000000..06c3f32abd5e1 --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,125 @@ +# PyTorch Copilot Instructions + +This is the PyTorch machine learning framework codebase. These instructions help AI agents navigate and contribute effectively. + +## Architecture Overview + +### Core Components + +- **c10/** - Core library (C++-10 compatible) for essential, binary-size-conscious functionality +- **aten/** - ATen tensor library (C++), PyTorch's foundation without autograd + - `aten/src/ATen/native/` - Modern operator implementations (CPU/CUDA/MPS/sparse) + - `aten/src/ATen/native/native_functions.yaml` - **Critical**: Declarative operator registry +- **torch/** - Python bindings and public API + - `torch/csrc/` - C++ Python bindings (hand-written and generated) + - `torch/csrc/autograd/` - Reverse-mode automatic differentiation + - `torch/csrc/jit/` - TorchScript JIT compiler +- **torchgen/** - Code generation tooling that reads `native_functions.yaml` +- **tools/** - Build scripts, autograd derivatives, code generation + +### The Code Generation Workflow + +**Most operator changes require editing `native_functions.yaml`**, not direct C++ files. This YAML file: +1. Declares operator signatures, variants (function/method), and dispatch behavior +2. Gets processed by `torchgen/` to generate C++/Python bindings +3. Produces headers in `build/aten/src/ATen/` during compilation + +Example entry structure: +```yaml +- func: my_op(Tensor self, Scalar alpha=1) -> Tensor + variants: function, method + dispatch: + CPU: my_op_cpu + CUDA: my_op_cuda +``` + +After editing `native_functions.yaml`, implement kernels in `aten/src/ATen/native/` (see `aten/src/ATen/native/README.md`). 
+ +## Development Workflows + +### Building from Source + +**Never run `setup.py` directly** - use pip with editable install: +```bash +python -m pip install --no-build-isolation -v -e . +``` + +Speed up builds: +- `DEBUG=1` - Debug symbols with `-g -O0` +- `USE_CUDA=0` - Skip CUDA compilation +- `BUILD_TEST=0` - Skip C++ test binaries +- Install `ninja` (`pip install ninja`) for faster builds +- Use `ccache` for incremental compilation caching + +Rebuild specific targets: `(cd build && ninja )` + +### Testing + +**Critical**: DO NOT run entire test suites. Run specific tests only: +```bash +python test/test_torch.py TestTorch.test_specific_case +``` + +**Test structure**: All tests use `torch.testing._internal.common_utils`: +```python +from torch.testing._internal.common_utils import run_tests, TestCase + +class TestFeature(TestCase): + def test_something(self): + # Use self.assertEqual for tensor comparisons + pass + +if __name__ == "__main__": + run_tests() +``` + +**For bug fixes**: Create a standalone reproduction script first, verify it fails, then fix and add to appropriate test file. + +### Linting + +Run linter (not pre-commit): `lintrunner -a` (auto-applies fixes) + +## Project-Specific Conventions + +### Memory and Storage +- **Storage is never nullptr** (but `StorageImpl.data` may be nullptr for unallocated outputs) +- CUDA device info lives in storage objects + +### Python-C++ Integration (`torch/csrc/`) +- Always include `Python.h` **first** to avoid `_XOPEN_SOURCE` redefinition errors +- Use `pybind11::gil_scoped_acquire` before calling Python API or using `THPObjectPtr` +- Wrap entry points with `HANDLE_TH_ERRORS` / `END_HANDLE_TH_ERRORS` for exception conversion + +### Dispatch System +- PyTorch uses operator dispatch to route calls to backend-specific kernels +- Prefer `CompositeExplicitAutograd` dispatch when writing device-agnostic compound ops +- See `aten/src/ATen/native/README.md` for dispatch keyword guidance + +## Git Workflow (AI Agent Specific) + +When preparing PRs from this environment: +```bash +git stash -u +git reset --hard $(cat /tmp/orig_work.txt) # Reset to LOCAL branch +git stash pop +# Resolve conflicts if necessary +``` + +## Common Gotchas + +1. **Editing generated files** - If it's in `build/`, don't edit it. Edit the source template or `native_functions.yaml` +2. **NVCC template compilation** - NVCC is stricter about C++ than gcc/clang; code working on Linux may fail Windows CI +3. **Windows symbol visibility** - Use `TORCH_API` macros for exported symbols (required on Windows, optional on Linux) +4. **No internet access** - DO NOT attempt to install dependencies during development + +## Key Files Reference + +- `AGENTS.md` - Instructions specific to AI coding agents +- `CONTRIBUTING.md` - Comprehensive human contributor guide +- `GLOSSARY.md` - Terminology (ATen, kernels, operations, JIT, TorchScript) +- `aten/src/ATen/native/README.md` - Operator implementation guide +- `tools/autograd/derivatives.yaml` - Gradient definitions for autograd + +## Performance Debugging + +Use `TORCH_SHOW_CPP_STACKTRACES=1` for C++ traces in Python errors. For profiling, prefer `py-spy` over manual instrumentation. 
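As a side note (not part of the new file): pulled together, the build/test/lint guidance above amounts to a short local loop. The sketch below only restates commands already quoted in the instructions; the test case name is the file's own placeholder, and the environment variables are the optional build-trimming flags it lists.

```bash
# Not part of the patch: a minimal local loop assembled from the commands above.
export USE_CUDA=0 BUILD_TEST=0                          # trim the build while iterating
python -m pip install --no-build-isolation -v -e .      # editable install; never run setup.py directly
python test/test_torch.py TestTorch.test_specific_case  # one targeted test (placeholder name)
lintrunner -a                                           # auto-apply lint fixes before pushing
```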
diff --git a/.github/labeler.yml b/.github/labeler.yml index 7b47b9fefb5dc..cd3b9c809039e 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -91,13 +91,6 @@ "ciflow/trunk": - .ci/docker/ci_commit_pins/triton.txt -"oncall: distributed": -- torch/csrc/distributed/** -- torch/distributed/** -- torch/nn/parallel/** -- test/distributed/** -- torch/testing/_internal/distributed/** - "release notes: distributed (checkpoint)": - torch/distributed/checkpoint/** - test/distributed/checkpoint/** @@ -138,7 +131,8 @@ - test/test_matmul_cuda.py - test/test_scaled_matmul_cuda.py - test/inductor/test_fp8.py -- aten/src/ATen/native/cuda/Blas.cpp +- aten/src/ATen/native/cuda/*Blas.cpp +- aten/src/ATen/cuda/CUDA*Blas.* - torch/**/*cublas* - torch/_inductor/kernel/mm.py - test/inductor/test_max_autotune.py @@ -148,7 +142,8 @@ - test/test_matmul_cuda.py - test/test_scaled_matmul_cuda.py - test/inductor/test_fp8.py -- aten/src/ATen/native/cuda/Blas.cpp +- aten/src/ATen/native/cuda/*Blas.cpp +- aten/src/ATen/cuda/CUDA*Blas.* - torch/**/*cublas* - torch/_inductor/kernel/mm.py - test/inductor/test_max_autotune.py @@ -158,7 +153,21 @@ - test/test_matmul_cuda.py - test/test_scaled_matmul_cuda.py - test/inductor/test_fp8.py -- aten/src/ATen/native/cuda/Blas.cpp +- aten/src/ATen/native/cuda/*Blas.cpp +- aten/src/ATen/cuda/CUDA*Blas.* - torch/_inductor/kernel/mm.py - test/inductor/test_max_autotune.py - third_party/fbgemm + +"ciflow/mps": +- aten/src/ATen/mps/** +- aten/src/ATen/native/mps/** +- torch/_inductor/codegen/mps.py +- test/test_mps.py +- test/inductor/test_mps_basic.py + +"ciflow/h100-symm-mem": +- torch/csrc/distributed/c10d/symm_mem/** +- torch/distributed/_symmetric_memory/** +- test/distributed/**/*mem* +- test/distributed/**/*mem*/** diff --git a/.github/nitpicks.yml b/.github/nitpicks.yml index 1d08a36abf1d5..e3fe5d4725587 100644 --- a/.github/nitpicks.yml +++ b/.github/nitpicks.yml @@ -10,3 +10,4 @@ pathFilter: - 'torch/csrc/inductor/aoti_torch/c/*' - 'torch/csrc/inductor/aoti_torch/generated/*' + - 'torch/csrc/stable/c/*' diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index c15ba606398f6..1258f4b8d8088 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -2,11 +2,12 @@ tracking_issue: 24422 ciflow_tracking_issue: 64124 ciflow_push_tags: - ciflow/b200 -- ciflow/b200-symm-mem - ciflow/b200-distributed +- ciflow/b200-symm-mem - ciflow/binaries - ciflow/binaries_libtorch - ciflow/binaries_wheel +- ciflow/dynamo - ciflow/h100 - ciflow/h100-cutlass-backend - ciflow/h100-distributed @@ -22,6 +23,8 @@ ciflow_push_tags: - ciflow/inductor-perf-test-nightly-xpu - ciflow/inductor-periodic - ciflow/inductor-rocm +- ciflow/inductor-rocm-mi200 +- ciflow/inductor-rocm-mi300 - ciflow/linux-aarch64 - ciflow/mps - ciflow/nightly @@ -33,11 +36,13 @@ ciflow_push_tags: - ciflow/quantization-periodic - ciflow/riscv64 - ciflow/rocm +- ciflow/rocm-mi200 - ciflow/rocm-mi300 - ciflow/rocm-mi355 - ciflow/rocm-navi31 - ciflow/s390 - ciflow/slow +- ciflow/slow-rocm-mi200 - ciflow/torchbench - ciflow/triton_binaries - ciflow/trunk diff --git a/.github/scripts/delete_old_branches.py b/.github/scripts/delete_old_branches.py index 8032008edf122..42cd851f8e338 100644 --- a/.github/scripts/delete_old_branches.py +++ b/.github/scripts/delete_old_branches.py @@ -1,10 +1,11 @@ # Delete old branches import os import re +from collections.abc import Callable from datetime import datetime from functools import lru_cache from pathlib import Path -from typing import Any, Callable +from 
typing import Any from github_utils import gh_fetch_json_dict, gh_graphql from gitutils import GitRepo diff --git a/.github/scripts/filter_test_configs.py b/.github/scripts/filter_test_configs.py index 592c7aab6d933..ee102d3f560f9 100755 --- a/.github/scripts/filter_test_configs.py +++ b/.github/scripts/filter_test_configs.py @@ -8,10 +8,11 @@ import subprocess import sys import warnings +from collections.abc import Callable from enum import Enum from functools import cache from logging import info -from typing import Any, Callable, Optional +from typing import Any, Optional from urllib.request import Request, urlopen import yaml diff --git a/.github/scripts/generate_pytorch_version.py b/.github/scripts/generate_pytorch_version.py index b35ccf6bcd38a..85be79c762e28 100755 --- a/.github/scripts/generate_pytorch_version.py +++ b/.github/scripts/generate_pytorch_version.py @@ -50,7 +50,7 @@ def get_tag() -> str: def get_base_version() -> str: root = get_pytorch_root() - dirty_version = open(root / "version.txt").read().strip() + dirty_version = Path(root / "version.txt").read_text().strip() # Strips trailing a0 from version.txt, not too sure why it's there in the # first place return re.sub(LEGACY_BASE_VERSION_SUFFIX_PATTERN, "", dirty_version) diff --git a/.github/scripts/get_workflow_job_id.py b/.github/scripts/get_workflow_job_id.py index b04cbed76e955..54e66621c9fd0 100644 --- a/.github/scripts/get_workflow_job_id.py +++ b/.github/scripts/get_workflow_job_id.py @@ -11,7 +11,8 @@ import time import urllib import urllib.parse -from typing import Any, Callable, Optional +from collections.abc import Callable +from typing import Any, Optional from urllib.request import Request, urlopen diff --git a/.github/scripts/github_utils.py b/.github/scripts/github_utils.py index 110015988a5c3..6479fb64ddbaf 100644 --- a/.github/scripts/github_utils.py +++ b/.github/scripts/github_utils.py @@ -3,8 +3,9 @@ import json import os import warnings +from collections.abc import Callable from dataclasses import dataclass -from typing import Any, Callable, cast, Optional, Union +from typing import Any, cast, Optional, Union from urllib.error import HTTPError from urllib.parse import quote from urllib.request import Request, urlopen diff --git a/.github/scripts/gitutils.py b/.github/scripts/gitutils.py index 3a90ddb5f4c6b..6e3bb3f209177 100644 --- a/.github/scripts/gitutils.py +++ b/.github/scripts/gitutils.py @@ -4,10 +4,10 @@ import re import tempfile from collections import defaultdict -from collections.abc import Iterator +from collections.abc import Callable, Iterator from datetime import datetime from functools import wraps -from typing import Any, Callable, cast, Optional, TypeVar, Union +from typing import Any, cast, Optional, TypeVar, Union T = TypeVar("T") diff --git a/.github/scripts/lintrunner.sh b/.github/scripts/lintrunner.sh index b353617a45b2b..58cda19cfeb43 100755 --- a/.github/scripts/lintrunner.sh +++ b/.github/scripts/lintrunner.sh @@ -34,6 +34,9 @@ python3 torch/utils/data/datapipes/gen_pyi.py # Also check generated pyi files find torch -name '*.pyi' -exec git add --force -- "{}" + +# Print current environment +python3 -m pip freeze + RC=0 # Run lintrunner on all files if ! 
lintrunner --force-color --tee-json=lint.json ${ADDITIONAL_LINTRUNNER_ARGS} 2> /dev/null; then diff --git a/.github/scripts/trymerge.py b/.github/scripts/trymerge.py index c258284a00d83..697ab6992793d 100755 --- a/.github/scripts/trymerge.py +++ b/.github/scripts/trymerge.py @@ -17,12 +17,12 @@ import time import urllib.parse from collections import defaultdict -from collections.abc import Iterable +from collections.abc import Callable, Iterable from dataclasses import dataclass from functools import cache from pathlib import Path from re import Pattern -from typing import Any, Callable, cast, NamedTuple, Optional +from typing import Any, cast, NamedTuple, Optional from warnings import warn import yaml diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml index 29c2fc8e08476..b52ec158dd6d6 100644 --- a/.github/workflows/_linux-test.yml +++ b/.github/workflows/_linux-test.yml @@ -326,7 +326,7 @@ jobs: SCCACHE_BUCKET: ${{ !contains(matrix.runner, 'b200') && 'ossci-compiler-cache-circleci-v2' || '' }} SCCACHE_REGION: ${{ !contains(matrix.runner, 'b200') && 'us-east-1' || '' }} SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }} - DOCKER_IMAGE: ${{ inputs.docker-image }} + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }} XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }} diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml index 43ed76a63cc67..608aeba53e6d8 100644 --- a/.github/workflows/_rocm-test.yml +++ b/.github/workflows/_rocm-test.yml @@ -97,8 +97,8 @@ jobs: shell: bash run: | ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') - if [[ $ngpu -lt 4 ]]; then - echo "Error: only $ngpu GPU(s) detected, at least 4 GPUs are needed for distributed jobs" + if [[ $ngpu -lt 2 ]]; then # We are temporarily reducing this down to 2 from 4 so that we can run tests on nodes with fewer GPUs.
+ echo "Error: only $ngpu GPU(s) detected, at least 2 GPUs are needed for distributed jobs" exit 1 fi diff --git a/.github/workflows/_xpu-test.yml b/.github/workflows/_xpu-test.yml index e68bc6ead3a26..d27325b8a63dc 100644 --- a/.github/workflows/_xpu-test.yml +++ b/.github/workflows/_xpu-test.yml @@ -344,5 +344,21 @@ jobs: if-no-files-found: ignore path: ./**/core.[1-9]* + - name: Authenticate with AWS + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results + # The max duration enforced by the server side + role-duration-seconds: 18000 + aws-region: us-east-1 + + - name: Upload the benchmark results + uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main + with: + benchmark-results-dir: test/test-reports + dry-run: false + schema-version: v3 + github-token: ${{ secrets.GITHUB_TOKEN }} + - name: Teardown XPU uses: ./.github/actions/teardown-xpu diff --git a/.github/workflows/attention_op_microbenchmark.yml b/.github/workflows/attention_op_microbenchmark.yml new file mode 100644 index 0000000000000..e01bc49621dcf --- /dev/null +++ b/.github/workflows/attention_op_microbenchmark.yml @@ -0,0 +1,73 @@ +name: attention_op_microbenchmark + +on: + push: + tags: + - ciflow/op-benchmark/* + workflow_dispatch: + schedule: + # Run at 06:00 UTC everyday + - cron: 0 7 * * * + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + attn-microbenchmark-build: + if: github.repository_owner == 'pytorch' + uses: ./.github/workflows/_linux-build.yml + with: + runner: linux.12xlarge.memory + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '8.0 9.0' + test-matrix: | + { include: [ + { config: "attention_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.a100" }, + { config: "attention_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.h100" }, + ]} + secrets: inherit + + attn-microbenchmark-test: + name: attn-microbenchmark-test + uses: ./.github/workflows/_linux-test.yml + needs: attn-microbenchmark-build + with: + timeout-minutes: 500 + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image: ${{ needs.attn-microbenchmark-build.outputs.docker-image }} + test-matrix: ${{ needs.attn-microbenchmark-build.outputs.test-matrix }} + secrets: inherit + + # B200 runner + opmicrobenchmark-build-b200: + if: github.repository_owner == 'pytorch' + name: opmicrobenchmark-build-b200 + uses: ./.github/workflows/_linux-build.yml + with: + runner: linux.12xlarge.memory + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '10.0' + test-matrix: | + { include: [ + { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.dgx.b200" }, + ]} + secrets: inherit + + opmicrobenchmark-test-b200: + name: opmicrobenchmark-test-b200 + uses: ./.github/workflows/_linux-test.yml + needs: opmicrobenchmark-build-b200 + with: + timeout-minutes: 500 + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100 + docker-image: ${{ 
needs.opmicrobenchmark-build-b200.outputs.docker-image }} + test-matrix: ${{ needs.opmicrobenchmark-build-b200.outputs.test-matrix }} + aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + secrets: inherit diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 6fbe2e846d40b..408a8f0000504 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -56,6 +56,8 @@ jobs: pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9, pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11, pytorch-linux-jammy-py3.10-clang12, + pytorch-linux-jammy-py3.11-clang12, + pytorch-linux-jammy-py3.12-clang12, pytorch-linux-jammy-py3.13-clang12, pytorch-linux-jammy-py3.14-clang12, pytorch-linux-jammy-rocm-n-py3, @@ -65,9 +67,10 @@ jobs: pytorch-linux-jammy-py3.10-gcc11, pytorch-linux-jammy-py3-gcc11-inductor-benchmarks, pytorch-linux-jammy-py3.12-halide, + pytorch-linux-jammy-cuda12.8-py3.12-pallas, pytorch-linux-jammy-xpu-n-1-py3, - pytorch-linux-jammy-xpu-n-py3, - pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks, + pytorch-linux-noble-xpu-n-py3, + pytorch-linux-noble-xpu-n-py3-inductor-benchmarks, pytorch-linux-jammy-py3-clang18-asan, pytorch-linux-jammy-py3-clang12-onnx, pytorch-linux-jammy-linter, @@ -77,9 +80,11 @@ jobs: pytorch-linux-noble-riscv64-py3.12-gcc14 ] include: - - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11 + - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc13 runner: linux.arm64.m7g.4xlarge - - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks + - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-clang21 + runner: linux.arm64.m7g.4xlarge + - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc13-inductor-benchmarks runner: linux.arm64.m7g.4xlarge timeout-minutes: 600 # Docker uploads fail from LF runners, see https://github.com/pytorch/pytorch/pull/137358 @@ -114,6 +119,22 @@ jobs: with: docker-image: ${{ steps.build-docker-image.outputs.docker-image }} + - name: Generate output + if: contains(matrix.docker-image-name, 'rocm') + id: generate_output + run: | + docker_image_name="${{ matrix.docker-image-name }}" + docker_image_tag="${{ steps.build-docker-image.outputs.docker-image }}" + echo "${docker_image_name}=${docker_image_tag}" >> docker-builds-output-${docker_image_name}.txt + + - name: Upload artifacts + uses: actions/upload-artifact@v4.4.0 + if: contains(matrix.docker-image-name, 'rocm') + with: + name: docker-builds-artifacts-${{ matrix.docker-image-name }} + retention-days: 14 + path: ./docker-builds-output-${{ matrix.docker-image-name }}.txt + - uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0 name: Push to https://ghcr.io/ id: push-to-ghcr-io diff --git a/.github/workflows/docker-cache-mi300.yml b/.github/workflows/docker-cache-mi300.yml deleted file mode 100644 index 02c1171c567aa..0000000000000 --- a/.github/workflows/docker-cache-mi300.yml +++ /dev/null @@ -1,55 +0,0 @@ -name: docker-cache-mi300 - -on: - # run every 6 hours - schedule: - - cron: 0 0,6,12,18 * * * - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }} - cancel-in-progress: true - -permissions: - id-token: write - contents: read - -jobs: - docker-cache: - if: github.repository_owner == 'pytorch' - runs-on: rocm-docker - steps: - - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main - with: - no-sudo: true - - - name: 
configure aws credentials - id: aws_creds - uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 - with: - role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only - aws-region: us-east-1 - role-duration-seconds: 18000 - - - name: Login to Amazon ECR - id: login-ecr - continue-on-error: false - uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 - - - name: Calculate docker image - id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main - with: - docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 - push: false - - - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main - with: - docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - - - name: Tar and upload to S3 bucket - run: | - sudo docker save -o ~/docker-data/pytorch/pytorch_docker_image.tar ${{ steps.calculate-docker-image.outputs.docker-image }} - sudo rclone copy -P --s3-upload-concurrency 64 --s3-chunk-size 200M --s3-upload-cutoff 300M ~/docker-data/pytorch/pytorch_docker_image.tar oci:pytorchbucket0002/pytorch_docker_image --progress diff --git a/.github/workflows/docker-cache-rocm.yml b/.github/workflows/docker-cache-rocm.yml new file mode 100644 index 0000000000000..78d38de3ac69a --- /dev/null +++ b/.github/workflows/docker-cache-rocm.yml @@ -0,0 +1,105 @@ +name: docker-cache-rocm + +on: + workflow_run: + workflows: [docker-builds] + branches: [main, release] + types: + - completed + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + actions: read + +jobs: + download-docker-builds-artifacts: + if: github.repository_owner == 'pytorch' + name: download-docker-builds-artifacts + runs-on: ubuntu-latest + outputs: + pytorch-linux-jammy-rocm-n-py3: ${{ steps.process-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3 }} + pytorch-linux-noble-rocm-n-py3: ${{ steps.process-artifacts.outputs.pytorch-linux-noble-rocm-n-py3 }} + pytorch-linux-jammy-rocm-n-py3-benchmarks: ${{ steps.process-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3-benchmarks }} + steps: + - name: Download artifacts + uses: actions/download-artifact@v4.1.7 + with: + run-id: ${{ github.event.workflow_run.id }} + path: ./docker-builds-artifacts + merge-multiple: true + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Process artifacts + id: process-artifacts + run: | + ls -R ./docker-builds-artifacts + cat ./docker-builds-artifacts/*txt >> "${GITHUB_OUTPUT}" + cat "${GITHUB_OUTPUT}" + + docker-cache: + if: github.repository_owner == 'pytorch' + needs: download-docker-builds-artifacts + strategy: + fail-fast: false + matrix: + runner: [linux.rocm.gfx942.docker-cache] + docker-image: [ + "${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3 }}", + "${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-noble-rocm-n-py3 }}", + "${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3-benchmarks }}" + ] + runs-on: "${{ matrix.runner }}" + steps: + - name: debug + run: | + JSON_STRINGIFIED="${{ toJSON(needs.download-docker-builds-artifacts.outputs) }}" + echo "Outputs of download-docker-builds-artifacts job: ${JSON_STRINGIFIED}" + + - name: configure aws credentials + id: aws_creds + uses: 
aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + + - name: Login to Amazon ECR + id: login-ecr + continue-on-error: false + uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 + + - name: Generate ghcr.io tag + id: ghcr-io-tag + run: | + ecr_image="${{ matrix.docker-image }}" + ghcr_image="ghcr.io/pytorch/ci-image:${ecr_image##*:}" + echo "ghcr_image=${ghcr_image}" >> "$GITHUB_OUTPUT" + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@main + with: + docker-image: ${{ steps.ghcr-io-tag.outputs.ghcr_image }} + + - name: Save as tarball + run: | + docker_image_tag=${{ matrix.docker-image }} + docker_image_tag="${docker_image_tag#*:}" # Remove everything before and including first ":" + docker_image_tag="${docker_image_tag%-*}" # Remove everything after and including last "-" + ref_name=${{ github.event.workflow_run.head_branch }} + if [[ $ref_name =~ "release/" ]]; then + ref_suffix="release" + elif [[ $ref_name == "main" ]]; then + ref_suffix="main" + else + echo "Unexpected branch in ref_name: ${ref_name}" && exit 1 + fi + docker tag ${{ steps.ghcr-io-tag.outputs.ghcr_image }} ${{ matrix.docker-image }} + # mv is an atomic operation, so we use an intermediate tar.tmp file to prevent read-write contention + docker save -o ~/pytorch-data/docker/${docker_image_tag}.tar.tmp ${{ matrix.docker-image }} + mv ~/pytorch-data/docker/${docker_image_tag}.tar.tmp ~/pytorch-data/docker/${docker_image_tag}_${ref_suffix}.tar diff --git a/.github/workflows/dynamo-unittest.yml b/.github/workflows/dynamo-unittest.yml new file mode 100644 index 0000000000000..e1399b1376de4 --- /dev/null +++ b/.github/workflows/dynamo-unittest.yml @@ -0,0 +1,70 @@ +# Workflow: Dynamo Unit Test +# runs unit tests for dynamo. 
+name: dynamo-unittest + +on: + push: + tags: + - ciflow/dynamo/* + workflow_call: + schedule: + - cron: 29 8 * * * # about 1:29am PDT + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + get-label-type: + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + opt_out_experiments: lf + + dynamo-build: + name: dynamo-build + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + strategy: + matrix: + python-version: ['3.11', '3.12'] + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-py${{ matrix.python-version }}-clang12 + docker-image-name: ci-image:pytorch-linux-jammy-py${{ matrix.python-version }}-clang12 + test-matrix: | + { include: [ + { config: "dynamo_core", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" }, + { config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" }, + { config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" }, + { config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" }, + ]} + secrets: inherit + + dynamo-test: + name: dynamo-test + uses: ./.github/workflows/_linux-test.yml + needs: [get-label-type, dynamo-build] + strategy: + matrix: + python-version: ['3.11', '3.12'] + with: + build-environment: linux-jammy-py${{ matrix.python-version }}-clang12 + docker-image: ci-image:pytorch-linux-jammy-py${{ matrix.python-version }}-clang12 + test-matrix: | + { include: [ + { config: "dynamo_core", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" }, + { config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" }, + { config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" }, + { config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" }, + ]} + secrets: inherit diff --git a/.github/workflows/h100-distributed.yml b/.github/workflows/h100-distributed.yml index be19b8f961f4d..c05b61e30a635 100644 --- a/.github/workflows/h100-distributed.yml +++ b/.github/workflows/h100-distributed.yml @@ -37,7 +37,6 @@ jobs: needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runner: "linux.c7i.12xlarge" build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 cuda-arch-list: '9.0' diff --git a/.github/workflows/inductor-perf-test-nightly-aarch64.yml b/.github/workflows/inductor-perf-test-nightly-aarch64.yml index e16c8be79130d..46a1966570c63 100644 --- 
a/.github/workflows/inductor-perf-test-nightly-aarch64.yml +++ b/.github/workflows/inductor-perf-test-nightly-aarch64.yml @@ -72,7 +72,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner: linux.arm64.m7g.4xlarge build-environment: linux-jammy-aarch64-py3.10 - docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks + docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc13-inductor-benchmarks test-matrix: | { include: [ { config: "inductor_huggingface_perf_cpu_aarch64", shard: 1, num_shards: 9, runner: "linux.arm64.m7g.metal" }, diff --git a/.github/workflows/inductor-perf-test-nightly-xpu.yml b/.github/workflows/inductor-perf-test-nightly-xpu.yml index c2db8c310e368..28b10996bf38a 100644 --- a/.github/workflows/inductor-perf-test-nightly-xpu.yml +++ b/.github/workflows/inductor-perf-test-nightly-xpu.yml @@ -83,8 +83,8 @@ jobs: needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-xpu-n-py3.10 - docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks + build-environment: linux-noble-xpu-n-py3.10 + docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3-inductor-benchmarks runner: linux.c7i.12xlarge test-matrix: | { include: [ @@ -117,7 +117,7 @@ jobs: uses: ./.github/workflows/_xpu-test.yml needs: xpu-n-py3_10-inductor-benchmark-build with: - build-environment: linux-jammy-xpu-n-py3.10 + build-environment: linux-noble-xpu-n-py3.10 dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-false-cppwrapper-true-aotinductor-true-freezing_cudagraphs-false-cudagraphs_low_precision-false docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }} test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }} @@ -137,7 +137,7 @@ jobs: uses: ./.github/workflows/_xpu-test.yml needs: xpu-n-py3_10-inductor-benchmark-build with: - build-environment: linux-jammy-xpu-n-py3.10 + build-environment: linux-noble-xpu-n-py3.10 dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }} docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }} test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }} diff --git a/.github/workflows/inductor-rocm.yml b/.github/workflows/inductor-rocm-mi200.yml similarity index 95% rename from .github/workflows/inductor-rocm.yml rename to .github/workflows/inductor-rocm-mi200.yml index b2ff53a645481..55de9a2121cf6 100644 --- a/.github/workflows/inductor-rocm.yml +++ b/.github/workflows/inductor-rocm-mi200.yml @@ -1,13 +1,13 @@ -name: inductor-rocm +name: inductor-rocm-mi200 on: schedule: - - cron: 0 * * * * + - cron: 0 */3 * * * push: branches: - release/* tags: - - ciflow/inductor-rocm/* + - ciflow/inductor-rocm-mi200/* workflow_dispatch: concurrency: diff --git a/.github/workflows/inductor-rocm-mi300.yml b/.github/workflows/inductor-rocm-mi300.yml index 732ec7eb85f3e..dee10a0db3c16 100644 --- a/.github/workflows/inductor-rocm-mi300.yml +++ b/.github/workflows/inductor-rocm-mi300.yml @@ -7,6 +7,7 @@ on: - release/* tags: - ciflow/inductor-rocm/* + - 
ciflow/inductor-rocm-mi300/* workflow_dispatch: concurrency: diff --git a/.github/workflows/inductor-unittest.yml b/.github/workflows/inductor-unittest.yml index 6ab276a57fc4d..ca9b57cab2ddb 100644 --- a/.github/workflows/inductor-unittest.yml +++ b/.github/workflows/inductor-unittest.yml @@ -81,6 +81,32 @@ jobs: test-matrix: ${{ needs.inductor-halide-build.outputs.test-matrix }} secrets: inherit + inductor-pallas-build: + name: inductor-pallas-build + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + build-environment: linux-jammy-cuda12.8-py3.12-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-py3.12-pallas + cuda-arch-list: '8.9' + runner: linux.8xlarge.memory + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + test-matrix: | + { include: [ + { config: "inductor-pallas", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu" }, + ]} + secrets: inherit + + inductor-pallas-test: + name: inductor-pallas-test + uses: ./.github/workflows/_linux-test.yml + needs: inductor-pallas-build + with: + build-environment: linux-jammy-py3.12-gcc11 + docker-image: ${{ needs.inductor-pallas-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-pallas-build.outputs.test-matrix }} + secrets: inherit + inductor-triton-cpu-build: name: inductor-triton-cpu-build uses: ./.github/workflows/_linux-build.yml @@ -115,10 +141,10 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" test-matrix: | { include: [ - { config: "inductor_amx", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, - { config: "inductor_amx", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, - { config: "inductor_avx2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.10xlarge.avx2" }, - { config: "inductor_avx2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.10xlarge.avx2" }, + { config: "inductor_amx", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, + { config: "inductor_amx", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, + { config: "inductor_avx2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.avx2" }, + { config: "inductor_avx2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.avx2" }, ]} secrets: inherit diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml index 2616141c0dc2a..8a913c3b36a11 100644 --- a/.github/workflows/inductor.yml +++ b/.github/workflows/inductor.yml @@ -84,13 +84,13 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" test-matrix: | { include: [ - { config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, - { config: "cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, - { config: "dynamic_cpu_inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, - { config: "dynamic_cpu_inductor_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, - { config: "dynamic_cpu_inductor_timm", shard: 
2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, - { config: "dynamic_cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, - { config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, + { config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, + { config: "cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, + { config: "dynamic_cpu_inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, + { config: "dynamic_cpu_inductor_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, + { config: "dynamic_cpu_inductor_timm", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, + { config: "dynamic_cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, + { config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, { config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.24xl.spr-metal" }, ]} build-additional-packages: "vision audio torchao" diff --git a/.github/workflows/linux-aarch64.yml b/.github/workflows/linux-aarch64.yml index 2b840a39a5c21..e6690b1043006 100644 --- a/.github/workflows/linux-aarch64.yml +++ b/.github/workflows/linux-aarch64.yml @@ -33,7 +33,7 @@ jobs: with: runner_prefix: ${{ needs.get-label-type.outputs.label-type }} build-environment: linux-jammy-aarch64-py3.10 - docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc13 runner: linux.arm64.m7g.4xlarge test-matrix: | { include: [ diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 0682dd2144afd..c47b0c5763078 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -5,9 +5,11 @@ on: - cron: 0 0 * * * push: tags: - # NOTE: Doc build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + # NOTE: Doc build pipelines should only get triggered on: + # Major or minor release candidate builds + - v[0-9]+.[0-9]+.0+-rc[0-9]+ + # Final RC for major, minor and patch releases + - v[0-9]+.[0-9]+.[0-9]+ - ciflow/nightly/* workflow_dispatch: diff --git a/.github/workflows/operator_benchmark.yml b/.github/workflows/operator_benchmark.yml index 40fb3b8d0c85f..758147f5fe18e 100644 --- a/.github/workflows/operator_benchmark.yml +++ b/.github/workflows/operator_benchmark.yml @@ -60,7 +60,7 @@ jobs: with: build-environment: linux-jammy-aarch64-py3.10 runner: linux.arm64.m7g.4xlarge - docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc13 test-matrix: | { include: [ { config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.arm64.m8g.4xlarge" }, diff --git a/.github/workflows/periodic-rocm-mi200.yml b/.github/workflows/periodic-rocm-mi200.yml index 
6b65bf05cbde0..18e7b60570bf8 100644 --- a/.github/workflows/periodic-rocm-mi200.yml +++ b/.github/workflows/periodic-rocm-mi200.yml @@ -11,7 +11,6 @@ on: - cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests push: tags: - - ciflow/periodic/* - ciflow/periodic-rocm-mi200/* branches: - release/* diff --git a/.github/workflows/periodic-rocm-mi300.yml b/.github/workflows/periodic-rocm-mi300.yml index 4d8890e69fc73..ce68ee8bc8e03 100644 --- a/.github/workflows/periodic-rocm-mi300.yml +++ b/.github/workflows/periodic-rocm-mi300.yml @@ -11,6 +11,7 @@ on: - cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests push: tags: + - ciflow/periodic/* - ciflow/periodic-rocm-mi300/* branches: - release/* diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index e3af55e736503..51e211a5ad2ad 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -70,6 +70,7 @@ jobs: { config: "distributed", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, { config: "distributed", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, { config: "numpy_2_x", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" }, + { config: "libtorch_agnostic_targetting", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, ]} secrets: inherit @@ -342,16 +343,16 @@ jobs: test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} secrets: inherit - linux-jammy-xpu-n-py3_10-build: - name: linux-jammy-xpu-n-py3.10 + linux-noble-xpu-n-py3_10-build: + name: linux-noble-xpu-n-py3.10 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: # This should sync with the build in xpu.yml but xpu uses a larger runner # sync-tag: linux-xpu-n-build runner_prefix: ${{ needs.get-label-type.outputs.label-type }} - build-environment: linux-jammy-xpu-n-py3.10 - docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3 + build-environment: linux-noble-xpu-n-py3.10 + docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3 test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 4, runner: "linux.idc.xpu" }, diff --git a/.github/workflows/rocm.yml b/.github/workflows/rocm-mi200.yml similarity index 97% rename from .github/workflows/rocm.yml rename to .github/workflows/rocm-mi200.yml index ffe6efbe0433c..c947e361bfcb5 100644 --- a/.github/workflows/rocm.yml +++ b/.github/workflows/rocm-mi200.yml @@ -1,15 +1,16 @@ -name: rocm +name: rocm-mi200 on: push: branches: - release/* tags: - - ciflow/rocm/* + - ciflow/rocm-mi200/* workflow_dispatch: schedule: - cron: 29 8 * * * # about 1:29am PDT - - cron: 0 * * * * + - cron: 0 */3 * * * + concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} diff --git a/.github/workflows/rocm-mi300.yml b/.github/workflows/rocm-mi300.yml index c50111d068d24..d20b37be20876 100644 --- a/.github/workflows/rocm-mi300.yml +++ b/.github/workflows/rocm-mi300.yml @@ -6,6 +6,7 @@ on: - main - release/* tags: + - ciflow/rocm/* - ciflow/rocm-mi300/* workflow_dispatch: schedule: diff --git a/.github/workflows/slow-rocm-mi200.yml b/.github/workflows/slow-rocm-mi200.yml new file mode 100644 index 0000000000000..c564857dca9ce --- /dev/null 
+++ b/.github/workflows/slow-rocm-mi200.yml @@ -0,0 +1,81 @@ +# This workflow is dedicated to host slow jobs that are run only periodically because +# they are too slow to run in every commit. The list of slow tests can be found in +# https://github.com/pytorch/test-infra/blob/generated-stats/stats/slow-tests.json +name: slow-rocm-mi200 + +on: + push: + branches: + - release/* + tags: + - ciflow/slow/* + - ciflow/slow-rocm-mi200/* + schedule: + - cron: 0 */3 * * * + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + llm-td: + if: github.repository_owner == 'pytorch' + name: before-test + uses: ./.github/workflows/llm_td_retrieval.yml + permissions: + id-token: write + contents: read + + target-determination: + name: before-test + uses: ./.github/workflows/target_determination.yml + needs: llm-td + permissions: + id-token: write + contents: read + + get-label-type: + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + + linux-jammy-rocm-py3_10-build: + name: linux-jammy-rocm-py3.10 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-rocm-py3.10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 + sync-tag: rocm-build + test-matrix: | + { include: [ + { config: "slow", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] }, + { config: "slow", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] }, + ]} + secrets: inherit + + linux-jammy-rocm-py3_10-test: + permissions: + id-token: write + contents: read + name: linux-jammy-rocm-py3.10 + uses: ./.github/workflows/_rocm-test.yml + needs: + - linux-jammy-rocm-py3_10-build + - target-determination + with: + build-environment: linux-jammy-rocm-py3.10 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} + secrets: inherit diff --git a/.github/workflows/slow.yml b/.github/workflows/slow.yml index d4992a2ddb2cf..c14caee9a336c 100644 --- a/.github/workflows/slow.yml +++ b/.github/workflows/slow.yml @@ -105,36 +105,6 @@ jobs: test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }} secrets: inherit - linux-jammy-rocm-py3_10-build: - name: linux-jammy-rocm-py3.10 - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-rocm-py3.10 - docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 - test-matrix: | - { include: [ - { config: "slow", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] }, - { config: "slow", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2", 
owners: ["module:rocm"] }, - ]} - secrets: inherit - - linux-jammy-rocm-py3_10-test: - permissions: - id-token: write - contents: read - name: linux-jammy-rocm-py3.10 - uses: ./.github/workflows/_rocm-test.yml - needs: - - linux-jammy-rocm-py3_10-build - - target-determination - with: - build-environment: linux-jammy-rocm-py3.10 - docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} - secrets: inherit - linux-jammy-py3_10-clang18-asan-build: name: linux-jammy-py3.10-clang18-asan uses: ./.github/workflows/_linux-build.yml diff --git a/.github/workflows/test-b200.yml b/.github/workflows/test-b200.yml index ef7f75bc4b2b4..07fd9b18fdada 100644 --- a/.github/workflows/test-b200.yml +++ b/.github/workflows/test-b200.yml @@ -5,7 +5,9 @@ # Flow: # 1. Builds PyTorch with CUDA 12.8+ and sm100 architecture for B200 # 2. Runs smoke tests on linux.dgx.b200 runner -# 3. Tests executed are defined in .ci/pytorch/test.sh -> test_python_smoke() function +# 3. Tests executed are defined in .ci/pytorch/test.sh -> test_python_smoke_b200() function +# - Includes matmul, scaled_matmul, FP8, and FlashAttention CuTe tests +# - FlashAttention CuTe DSL is installed as part of test execution # # Triggered by: # - Pull requests modifying this workflow file diff --git a/.github/workflows/test-h100.yml b/.github/workflows/test-h100.yml index ec99f4473bb0b..510473d5306ad 100644 --- a/.github/workflows/test-h100.yml +++ b/.github/workflows/test-h100.yml @@ -41,7 +41,6 @@ jobs: needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runner: linux.12xlarge.memory build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 cuda-arch-list: '9.0' diff --git a/.github/workflows/trunk-rocm-mi300.yml b/.github/workflows/trunk-rocm-mi300.yml new file mode 100644 index 0000000000000..23ab5e9260a3e --- /dev/null +++ b/.github/workflows/trunk-rocm-mi300.yml @@ -0,0 +1,83 @@ +name: trunk-rocm-mi300 + +on: + push: + branches: + - main + - release/* + workflow_dispatch: + schedule: + - cron: 29 8 * * * # about 1:29am PDT + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + llm-td: + if: github.repository_owner == 'pytorch' + name: before-test + uses: ./.github/workflows/llm_td_retrieval.yml + permissions: + id-token: write + contents: read + + target-determination: + name: before-test + uses: ./.github/workflows/target_determination.yml + needs: llm-td + permissions: + id-token: write + contents: read + + get-label-type: + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + + linux-jammy-rocm-py3_10-build: + name: linux-jammy-rocm-py3.10 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-rocm-py3.10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 + sync-tag: rocm-build + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" }, + { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" }, + { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" }, + { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" }, + { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" }, + { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" }, + { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4.b" }, + { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4.b" }, + { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4.b" }, + ]} + secrets: inherit + + linux-jammy-rocm-py3_10-test: + permissions: + id-token: write + contents: read + name: linux-jammy-rocm-py3.10 + uses: ./.github/workflows/_rocm-test.yml + needs: + - linux-jammy-rocm-py3_10-build + - target-determination + with: + build-environment: linux-jammy-rocm-py3.10 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} + secrets: inherit diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 6ba810c3a9582..667c37727045b 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -83,6 +83,7 @@ jobs: { config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" }, { config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" }, { config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" }, + { config: "libtorch_agnostic_targetting", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" }, ]} secrets: inherit diff --git a/.github/workflows/upload-test-stats.yml b/.github/workflows/upload-test-stats.yml index 24c3ab3db84f3..b3d8073aad3b3 100644 --- a/.github/workflows/upload-test-stats.yml +++ b/.github/workflows/upload-test-stats.yml @@ -5,21 +5,23 @@ on: workflows: - pull - trunk + - trunk-rocm-mi300 - periodic - periodic-rocm-mi200 - periodic-rocm-mi300 - inductor - unstable - slow + - slow-rocm-mi200 - unstable-periodic - inductor-periodic - - rocm + - rocm-mi200 - rocm-mi300 - rocm-mi355 - inductor-micro-benchmark - inductor-micro-benchmark-x86 - inductor-cu124 - - inductor-rocm + - inductor-rocm-mi200 - inductor-rocm-mi300 - mac-mps - linux-aarch64 diff --git a/.github/workflows/xpu.yml b/.github/workflows/xpu.yml index 36f603f70fde7..d9a1ba13d2b59 100644 --- a/.github/workflows/xpu.yml +++ b/.github/workflows/xpu.yml @@ -47,15 +47,15 @@ jobs: ]} secrets: inherit - linux-jammy-xpu-n-py3_10-build: - name: linux-jammy-xpu-n-py3.10 + linux-noble-xpu-n-py3_10-build: + name: linux-noble-xpu-n-py3.10 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: sync-tag: linux-xpu-n-build runner_prefix: ${{ needs.get-label-type.outputs.label-type }} - build-environment: linux-jammy-xpu-n-py3.10 - docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3 + build-environment: linux-noble-xpu-n-py3.10 + docker-image-name: 
ci-image:pytorch-linux-noble-xpu-n-py3 runner: linux.c7i.12xlarge test-matrix: | { include: [ @@ -74,17 +74,17 @@ jobs: ]} secrets: inherit - linux-jammy-xpu-n-py3_10-test: - name: linux-jammy-xpu-n-py3.10 + linux-noble-xpu-n-py3_10-test: + name: linux-noble-xpu-n-py3.10 uses: ./.github/workflows/_xpu-test.yml - needs: linux-jammy-xpu-n-py3_10-build + needs: linux-noble-xpu-n-py3_10-build permissions: id-token: write contents: read with: - build-environment: linux-jammy-xpu-n-py3.10 - docker-image: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.test-matrix }} + build-environment: linux-noble-xpu-n-py3.10 + docker-image: ${{ needs.linux-noble-xpu-n-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-noble-xpu-n-py3_10-build.outputs.test-matrix }} secrets: inherit windows-xpu-n-1-build: diff --git a/.lintrunner.toml b/.lintrunner.toml index cee0249ad96eb..7a6e241f90c8d 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -143,7 +143,8 @@ init_command = [ 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', 'numpy==1.26.4 ; python_version >= "3.10" and python_version <= "3.11"', - 'numpy==2.1.0 ; python_version >= "3.12"', + 'numpy==2.1.0 ; python_version >= "3.12" and python_version <= "3.13"', + 'numpy==2.3.4 ; python_version >= "3.14"', 'expecttest==0.3.0', 'pyrefly==0.36.2', 'sympy==1.13.3', @@ -185,6 +186,8 @@ include_patterns = [ 'aten/src/ATen/native/nested/cuda/*.h', 'aten/src/ATen/native/nested/*.cpp', 'aten/src/ATen/native/nested/*.h', + 'aten/src/ATen/xpu/**/*.h', + 'aten/src/ATen/xpu/**/*.cpp', 'c10/**/*.cpp', 'c10/**/*.h', 'torch/*.h', @@ -1401,7 +1404,7 @@ init_command = [ '--dry-run={{DRYRUN}}', 'usort==1.0.8.post1', 'isort==6.0.1', - 'ruff==0.13.1', # sync with RUFF + 'ruff==0.14.4', # sync with RUFF ] is_formatter = true @@ -1536,7 +1539,7 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', - 'ruff==0.13.1', # sync with PYFMT + 'ruff==0.14.4', # sync with PYFMT ] is_formatter = true diff --git a/.spin/cmds.py b/.spin/cmds.py new file mode 100644 index 0000000000000..a81717c7423be --- /dev/null +++ b/.spin/cmds.py @@ -0,0 +1,330 @@ +import hashlib +import subprocess +import sys +from pathlib import Path + +import click +import spin + + +def file_digest(file, algorithm: str): + try: + return hashlib.file_digest(file, algorithm) + except AttributeError: + pass # Fallback to manual implementation below + hash = hashlib.new(algorithm) + while chunk := file.read(8192): + hash.update(chunk) + return hash + + +def _hash_file(file): + with open(file, "rb") as f: + hash = file_digest(f, "sha256") + return hash.hexdigest() + + +def _hash_files(files): + hashes = {file: _hash_file(file) for file in files} + return hashes + + +def _read_hashes(hash_file: Path): + if not hash_file.exists(): + return {} + with hash_file.open("r") as f: + lines = f.readlines() + hashes = {} + for line in lines: + hash = line[:64] + file = line[66:].strip() + hashes[file] = hash + return hashes + + +def _updated_hashes(hash_file, files_to_hash): + old_hashes = _read_hashes(hash_file) + new_hashes = _hash_files(files_to_hash) + if new_hashes != old_hashes: + return new_hashes + return None + + +@click.command() +def regenerate_version(): + """Regenerate version.py.""" + cmd = [ + sys.executable, + "-m", + "tools.generate_torch_version", + "--is-debug=false", + ] + spin.util.run(cmd) + + +TYPE_STUBS = [ + ( + "Pytorch type stubs", + 
Path(".lintbin/.pytorch-type-stubs.sha256"), + [ + "aten/src/ATen/native/native_functions.yaml", + "aten/src/ATen/native/tags.yaml", + "tools/autograd/deprecated.yaml", + ], + [ + sys.executable, + "-m", + "tools.pyi.gen_pyi", + "--native-functions-path", + "aten/src/ATen/native/native_functions.yaml", + "--tags-path", + "aten/src/ATen/native/tags.yaml", + "--deprecated-functions-path", + "tools/autograd/deprecated.yaml", + ], + ), + ( + "Datapipes type stubs", + None, + [], + [ + sys.executable, + "torch/utils/data/datapipes/gen_pyi.py", + ], + ), +] + + +@click.command() +def regenerate_type_stubs(): + """Regenerate type stubs.""" + for name, hash_file, files_to_hash, cmd in TYPE_STUBS: + if hash_file: + if hashes := _updated_hashes(hash_file, files_to_hash): + click.echo( + f"Changes detected in type stub files for {name}. Regenerating..." + ) + spin.util.run(cmd) + hash_file.parent.mkdir(parents=True, exist_ok=True) + with hash_file.open("w") as f: + for file, hash in hashes.items(): + f.write(f"{hash} {file}\n") + click.echo("Type stubs and hashes updated.") + else: + click.echo(f"No changes detected in type stub files for {name}.") + else: + click.echo(f"No hash file for {name}. Regenerating...") + spin.util.run(cmd) + click.echo("Type stubs regenerated.") + + +@click.command() +def regenerate_clangtidy_files(): + """Regenerate clang-tidy files.""" + cmd = [ + sys.executable, + "-m", + "tools.linter.clang_tidy.generate_build_files", + ] + spin.util.run(cmd) + + +#: These linters are expected to need less than 3s cpu time total +VERY_FAST_LINTERS = { + "ATEN_CPU_GPU_AGNOSTIC", + "BAZEL_LINTER", + "C10_NODISCARD", + "C10_UNUSED", + "CALL_ONCE", + "CMAKE_MINIMUM_REQUIRED", + "CONTEXT_DECORATOR", + "COPYRIGHT", + "CUBINCLUDE", + "DEPLOY_DETECTION", + "ERROR_PRONE_ISINSTANCE", + "EXEC", + "HEADER_ONLY_LINTER", + "IMPORT_LINTER", + "INCLUDE", + "LINTRUNNER_VERSION", + "MERGE_CONFLICTLESS_CSV", + "META_NO_CREATE_UNBACKED", + "NEWLINE", + "NOQA", + "NO_WORKFLOWS_ON_FORK", + "ONCE_FLAG", + "PYBIND11_INCLUDE", + "PYBIND11_SPECIALIZATION", + "PYPIDEP", + "PYPROJECT", + "RAWCUDA", + "RAWCUDADEVICE", + "ROOT_LOGGING", + "TABS", + "TESTOWNERS", + "TYPEIGNORE", + "TYPENOSKIP", + "WORKFLOWSYNC", +} + + +#: These linters are expected to take a few seconds, but less than 10s cpu time total +FAST_LINTERS = { + "CMAKE", + "DOCSTRING_LINTER", + "GHA", + "NATIVEFUNCTIONS", + "RUFF", + "SET_LINTER", + "SHELLCHECK", + "SPACES", +} + + +#: These linters are expected to take more than 10s cpu time total; +#: some need more than 1 hour. 
+SLOW_LINTERS = { + "ACTIONLINT", + "CLANGFORMAT", + "CLANGTIDY", + "CODESPELL", + "FLAKE8", + "GB_REGISTRY", + "PYFMT", + "PYREFLY", + "TEST_DEVICE_BIAS", + "TEST_HAS_MAIN", +} + + +ALL_LINTERS = VERY_FAST_LINTERS | FAST_LINTERS | SLOW_LINTERS + + +LINTRUNNER_CACHE_INFO = ( + Path(".lintbin/.lintrunner.sha256"), + [ + "requirements.txt", + "pyproject.toml", + ".lintrunner.toml", + ], +) + + +LINTRUNNER_BASE_CMD = [ + "uvx", + "--python", + "3.10", + "lintrunner@0.12.7", +] + + +@click.command() +def setup_lint(): + """Set up lintrunner with current CI version.""" + cmd = LINTRUNNER_BASE_CMD + ["init"] + subprocess.run(cmd, check=True, capture_output=True, text=True) + + +def _check_linters(): + cmd = LINTRUNNER_BASE_CMD + ["list"] + ret = spin.util.run(cmd, output=False, stderr=subprocess.PIPE) + linters = {l.strip() for l in ret.stdout.decode().strip().split("\n")[1:]} + unknown_linters = linters - ALL_LINTERS + missing_linters = ALL_LINTERS - linters + if unknown_linters: + click.secho( + f"Unknown linters found; please add them to the correct category " + f"in .spin/cmds.py: {', '.join(unknown_linters)}", + fg="yellow", + ) + if missing_linters: + click.secho( + f"Missing linters found; please update the corresponding category " + f"in .spin/cmds.py: {', '.join(missing_linters)}", + fg="yellow", + ) + return unknown_linters, missing_linters + + +@spin.util.extend_command( + setup_lint, + doc=f""" + If configuration has changed, update lintrunner. + + Compares the stored old hashes of configuration files with new ones and + performs setup via setup-lint if the hashes have changed. + Hashes are stored in {LINTRUNNER_CACHE_INFO[0]}; the following files are + considered: {", ".join(LINTRUNNER_CACHE_INFO[1])}. + """, +) +@click.pass_context +def lazy_setup_lint(ctx, parent_callback, **kwargs): + if hashes := _updated_hashes(*LINTRUNNER_CACHE_INFO): + click.echo( + "Changes detected in lint configuration files. Setting up linting tools..." + ) + parent_callback(**kwargs) + hash_file = LINTRUNNER_CACHE_INFO[0] + hash_file.parent.mkdir(parents=True, exist_ok=True) + with hash_file.open("w") as f: + for file, hash in hashes.items(): + f.write(f"{hash} {file}\n") + click.echo("Linting tools set up and hashes updated.") + else: + click.echo("No changes detected in lint configuration files. 
Skipping setup.") + click.echo("Regenerating version...") + ctx.invoke(regenerate_version) + click.echo("Regenerating type stubs...") + ctx.invoke(regenerate_type_stubs) + click.echo("Done.") + _check_linters() + + +@click.command() +@click.option("-a", "--apply-patches", is_flag=True) +@click.pass_context +def lint(ctx, apply_patches, **kwargs): + """Lint all files.""" + ctx.invoke(lazy_setup_lint) + all_files_linters = VERY_FAST_LINTERS | FAST_LINTERS + changed_files_linters = SLOW_LINTERS + cmd = LINTRUNNER_BASE_CMD + if apply_patches: + cmd += ["--apply-patches"] + all_files_cmd = cmd + [ + "--take", + ",".join(all_files_linters), + "--all-files", + ] + spin.util.run(all_files_cmd) + changed_files_cmd = cmd + [ + "--take", + ",".join(changed_files_linters), + ] + spin.util.run(changed_files_cmd) + + +@click.command() +@click.pass_context +def fixlint(ctx, **kwargs): + """Autofix all files.""" + ctx.invoke(lint, apply_patches=True) + + +@click.command() +@click.option("-a", "--apply-patches", is_flag=True) +@click.pass_context +def quicklint(ctx, apply_patches, **kwargs): + """Lint changed files.""" + ctx.invoke(lazy_setup_lint) + cmd = LINTRUNNER_BASE_CMD + if apply_patches: + cmd += ["--apply-patches"] + spin.util.run(cmd) + + +@click.command() +@click.pass_context +def quickfix(ctx, **kwargs): + """Autofix changed files.""" + ctx.invoke(quicklint, apply_patches=True) diff --git a/CMakeLists.txt b/CMakeLists.txt index ca1e4164be9b8..0e020abda3925 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -234,7 +234,17 @@ option(USE_COLORIZE_OUTPUT "Colorize output during compilation" ON) option(USE_ASAN "Use Address+Undefined Sanitizers" OFF) option(USE_LSAN "Use Leak Sanitizer" OFF) option(USE_TSAN "Use Thread Sanitizer" OFF) + +# Track whether USE_CUDA was explicitly set by the user (before option() is called) +# If USE_CUDA is already defined in cache, it means user explicitly set it +if(DEFINED CACHE{USE_CUDA}) + set(_USE_CUDA_EXPLICITLY_SET TRUE) +else() + set(_USE_CUDA_EXPLICITLY_SET FALSE) +endif() + option(USE_CUDA "Use CUDA" ON) + option(USE_XPU "Use XPU" ON) cmake_dependent_option( BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON @@ -726,6 +736,44 @@ if(NOT DEFINED USE_BLAS) set(USE_BLAS ON) endif() +# Prioritized Text Linker Optimization +if(USE_PRIORITIZED_TEXT_FOR_LD) + + set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt") + set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld") + + execute_process( + COMMAND ${Python_EXECUTABLE} + ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py + --filein "${LINKER_SCRIPT_FILE_IN}" + --fout "${LINKER_SCRIPT_FILE_OUT}" + RESULT_VARIABLE _gen_result + OUTPUT_VARIABLE _gen_output + ERROR_VARIABLE _gen_error + ) + + if(NOT _gen_result EQUAL 0) + message(FATAL_ERROR + "Failed to generate linker script:\n${_gen_output}\n${_gen_error}") + endif() + + append_cxx_flag_if_supported("-ffunction-sections" CMAKE_CXX_FLAGS) + append_cxx_flag_if_supported("-fdata-sections" CMAKE_CXX_FLAGS) + append_c_flag_if_supported("-ffunction-sections" CMAKE_C_FLAGS) + append_c_flag_if_supported("-fdata-sections" CMAKE_C_FLAGS) + + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -T${LINKER_SCRIPT_FILE_OUT}") + set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -T${LINKER_SCRIPT_FILE_OUT}") + +else() + if(LINUX AND CPU_AARCH64) + message(WARNING [[ + It is strongly recommend to enable linker script optimization for all AArch64 Linux builds. 
+ To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1 + ]]) + endif() +endif() + # Build libtorch mobile library, which contains ATen/TH ops and native support # for TorchScript model, but doesn't contain not-yet-unified caffe2 ops; if(INTERN_BUILD_MOBILE) @@ -1392,9 +1440,6 @@ if(BUILD_JNI) add_subdirectory(android/pytorch_android) endif() -include(cmake/Summary.cmake) -caffe2_print_configuration_summary() - # Parse custom debug info if(DEFINED USE_CUSTOM_DEBINFO) string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}") @@ -1434,56 +1479,5 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA) DESTINATION "${CMAKE_INSTALL_BINDIR}") endif() -if(USE_PRIORITIZED_TEXT_FOR_LD) - add_compile_options( - $<$:-ffunction-sections> - $<$:-fdata-sections> - ) - set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld") - set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt") - - add_custom_command( - OUTPUT "${LINKER_SCRIPT_FILE_OUT}" - COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}" - DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}" - COMMENT "Generating prioritized text linker files" - VERBATIM - ) - - add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}") - - if(BUILD_PYTHON) - set(LINKER_OPT_TARGETS torch_python) - endif() - - if(NOT BUILD_LIBTORCHLESS) - list(APPEND LINKER_OPT_TARGETS torch_cpu c10) - if(USE_CUDA) - list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda) - endif() - if(USE_XPU) - list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu) - endif() - if(USE_ROCM) - list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip) - endif() - endif() - - foreach(tgt IN LISTS LINKER_OPT_TARGETS) - if(TARGET ${tgt}) - add_dependencies("${tgt}" generate_linker_script) - target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}") - set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}") - else() - message(WARNING "Requested target '${tgt}' for linker script optimization was not found.") - endif() - endforeach() - -else() - if(LINUX AND CPU_AARCH64) - message(WARNING [[ - It is strongly recommend to enable linker script optimization for all AArch64 Linux builds. - To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1 - ]]) - endif() -endif() +include(cmake/Summary.cmake) +caffe2_print_configuration_summary() diff --git a/CODEOWNERS b/CODEOWNERS index cc249dc4f43a2..137031066090e 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -210,8 +210,12 @@ torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A /test/inductor/test_flex_attention.py @drisspg /test/inductor/test_flex_decoding.py @drisspg -# Low Precision GEMMs +# Low Precision & Grouped GEMMs /aten/src/ATen/native/cuda/Blas.cpp @drisspg @slayton58 +/aten/src/ATen/native/cuda/GroupedBlas.cpp @drisspg @slayton58 +/aten/src/ATen/native/cuda/ScaledBlas.cpp @drisspg @slayton58 /aten/src/ATen/cuda/CUDABlas.cpp @drisspg @slayton58 /aten/src/ATen/cuda/CUDABlas.h @drisspg @slayton58 +/aten/src/ATen/cuda/CUDAScaledBlas.cpp @drisspg @slayton58 +/aten/src/ATen/cuda/CUDAScaledBlas.h @drisspg @slayton58 /test/test_scaled_matmul_cuda.py @drisspg @slayton58 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9df55ca6acd5c..bc0b0fc9bb00f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -18,7 +18,7 @@ aspects of contributing to PyTorch. 
- [Python Unit Testing](#python-unit-testing) - [Better local unit tests with `pytest`](#better-local-unit-tests-with-pytest) - [Local linting](#local-linting) - - [Running `mypy`](#running-mypy) + - [Running `pyrefly`](#running-pyrefly) - [C++ Unit Testing](#c-unit-testing) - [Run Specific CI Jobs](#run-specific-ci-jobs) - [Merging your Change](#merging-your-change) @@ -281,7 +281,7 @@ dependencies as well as the nightly binaries into the repo directory. **Prerequisites**: The following packages should be installed with `pip`: - `expecttest` and `hypothesis` - required to run tests -- `mypy` - recommended for linting +- `pyrefly` - recommended for type checking; see [Pyrefly](https://pyrefly.org/) - `pytest` - recommended to run tests more selectively Running ``` @@ -350,15 +350,32 @@ make lint Learn more about the linter on the [lintrunner wiki page](https://github.com/pytorch/pytorch/wiki/lintrunner) -#### Running `mypy` +#### Running `pyrefly` -`mypy` is an optional static type checker for Python. We have multiple `mypy` -configs for the PyTorch codebase that are automatically validated against whenever the linter is run. +[Pyrefly](https://pyrefly.org/) is a high-performance static type checker for Python. It provides fast type checking along with IDE features like autocomplete and instant error feedback. + +PyTorch uses Pyrefly for type checking across the codebase. The configuration is managed in `pyrefly.toml` at the root of the repository. + +**Getting Started with Pyrefly:** + +To run type checking on the PyTorch codebase: +```bash +pyrefly check +``` + +For more detailed error information with summaries: +```bash +pyrefly check --summarize-errors +``` + +**Learn More:** +- [Pyrefly Configuration](https://pyrefly.org/en/docs/configuration/) - Detailed configuration options +- [Pyrefly IDE Features](https://pyrefly.org/en/docs/IDE-features/) - Set up Pyrefly in your editor for real-time type checking +- [Python Typing Tutorial](https://pyrefly.org/en/docs/typing-for-python-developers/) - Learn about Python type annotations See [Guide for adding type annotations to PyTorch](https://github.com/pytorch/pytorch/wiki/Guide-for-adding-type-annotations-to-PyTorch) -for more information on how to set up `mypy` and tackle type annotation -tasks. +for PyTorch-specific guidance on how to set up `pyrefly` and tackle type annotation tasks in this codebase. ### C++ Unit Testing diff --git a/LICENSE b/LICENSE index 966a609b61e53..c23172f7aff02 100644 --- a/LICENSE +++ b/LICENSE @@ -37,7 +37,7 @@ Copyright (c) 2024 Tri Dao. All rights reserved. All contributions by Arm: -Copyright (c) 2021, 2023-2024 Arm Limited and/or its affiliates +Copyright (c) 2021, 2023-2025 Arm Limited and/or its affiliates All contributions from Caffe: Copyright(c) 2013, 2014, 2015, the respective contributors diff --git a/SECURITY.md b/SECURITY.md index ed8228af36724..2d2c8a0c5f1c5 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,7 +1,7 @@ # Security Policy - [**Reporting a Vulnerability**](#reporting-a-vulnerability) - - [**Using Pytorch Securely**](#using-pytorch-securely) + - [**Using PyTorch Securely**](#using-pytorch-securely) - [Untrusted models](#untrusted-models) - [TorchScript models](#torchscript-models) - [Untrusted inputs](#untrusted-inputs) @@ -10,28 +10,30 @@ - [**CI/CD security principles**](#cicd-security-principles) ## Reporting Security Issues -Beware that none of the topics under [Using Pytorch Securely](#using-pytorch-securely) are considered vulnerabilities of Pytorch.
+Beware that none of the topics under [Using PyTorch Securely](#using-pytorch-securely) are considered vulnerabilities of PyTorch. However, if you believe you have found a security vulnerability in PyTorch, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. Please report security issues using https://github.com/pytorch/pytorch/security/advisories/new -All reports submitted thru the security advisories mechanism would **either be made public or dismissed by the team within 90 days of the submission**. If advisory has been closed on the grounds that it is not a security issue, please do not hesitate to create an [new issue](https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml) as it is still likely a valid issue within the framework. +All reports submitted through the security advisories mechanism would **either be made public or dismissed by the team within 90 days of the submission**. If an advisory has been closed on the grounds that it is not a security issue, please do not hesitate to create a [new issue](https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml) as it is still likely a valid issue within the framework. + +**Note on crashes and out of bounds access**: PyTorch is a computational framework that performs operations on behalf of the caller. Like many low-level libraries, PyTorch generally does not validate all inputs to every function—the responsibility for providing valid arguments lies with the calling code. While crashes and out of bounds memory access should be reported as bugs, they are generally not considered security vulnerabilities in PyTorch's threat model. Please refer to the following page for our responsible disclosure policy, reward guidelines, and those things that should not be reported: https://www.facebook.com/whitehat -## Using Pytorch Securely -**Pytorch models are programs**, so treat its security seriously -- running untrusted models is equivalent to running untrusted code. In general we recommend that model weights and the python code for the model are distributed independently. That said, be careful about where you get the python code from and who wrote it (preferentially check for a provenance or checksums, do not run any pip installed package). +## Using PyTorch Securely +**PyTorch models are programs**, so treat their security seriously -- running untrusted models is equivalent to running untrusted code. In general we recommend that model weights and the python code for the model are distributed independently. That said, be careful about where you get the python code from and who wrote it (preferentially check for a provenance or checksums, do not run any pip installed package). ### Untrusted models Be careful when running untrusted models. This classification includes models created by unknown developers or utilizing data obtained from unknown sources[^data-poisoning-sources]. **Prefer to execute untrusted models within a secure, isolated environment such as a sandbox** (e.g., containers, virtual machines). This helps protect your system from potentially malicious code. You can find further details and instructions in [this page](https://developers.google.com/code-sandboxing). -**Be mindful of risky model formats**. Give preference to share and load weights with the appropriate format for your use case. [safetensors](https://huggingface.co/docs/safetensors/en/index) gives the most safety but is the most restricted in what it supports.
[`torch.load`](https://pytorch.org/docs/stable/generated/torch.load.html#torch.load) has a significantly larger surface of attack but is more flexible in what it can serialize. See the documentation for more details. +**Be mindful of risky model formats**. Give preference to sharing and loading weights with the appropriate format for your use case. [Safetensors](https://huggingface.co/docs/safetensors/en/index) gives the most safety but is the most restricted in what it supports. [`torch.load`](https://pytorch.org/docs/stable/generated/torch.load.html#torch.load) has a significantly larger attack surface but is more flexible in what it can serialize. See the documentation for more details. Even for more secure serialization formats, unexpected inputs to the downstream system can cause diverse security threats (e.g. denial of service, out of bound reads/writes) and thus we recommend extensive validation of any untrusted inputs. @@ -43,7 +45,7 @@ Important Note: The trustworthiness of a model is not binary. You must always de ### TorchScript models -TorchScript models should treated the same way as locally executable code from an unknown source. Only run TorchScript models if you trust the provider. Please note, that tools for introspecting TorchScript models (such as `torch.utils.model_dump`) may also execute partial or full code stored in those models, therefore they should be used only if you trust the provider of the binary you are about to load. +TorchScript models should be treated the same way as locally executable code from an unknown source. Only run TorchScript models if you trust the provider. Please note that tools for introspecting TorchScript models (such as `torch.utils.model_dump`) may also execute partial or full code stored in those models, therefore they should be used only if you trust the provider of the binary you are about to load. ### Untrusted inputs during training and prediction @@ -59,9 +61,9 @@ If applicable, prepare your model against bad inputs and prompt injections. Some ### Data privacy -**Take special security measures if your model if you train models with sensitive data**. Prioritize [sandboxing](https://developers.google.com/code-sandboxing) your models and: -- Do not feed sensitive data to untrusted model (even if runs in a sandboxed environment) -- If you consider publishing a model that was partially trained with sensitive data, be aware that data can potentially be recovered from the trained weights (especially if model overfits). +**Take special security measures if you train your models with sensitive data**. Prioritize [sandboxing](https://developers.google.com/code-sandboxing) your models and: +- Do not feed sensitive data to an untrusted model (even if it runs in a sandboxed environment) +- If you consider publishing a model that was partially trained with sensitive data, be aware that data can potentially be recovered from the trained weights (especially if the model overfits). ### Using distributed features diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 8b283c417b74b..ae762e1def3ec 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -260,7 +260,7 @@ IF(USE_FBGEMM_GENAI) if(USE_CUDA) # To avoid increasing the build time/binary size unnecessarily, use an allow-list of kernels to build. # If you want to integrate a kernel from FBGEMM into torch, you have to add it here.
- set(FBGEMM_CUTLASS_KERNELS_REGEX ".*(mx8mx8bf16_grouped|f4f4bf16_grouped).*") + set(FBGEMM_CUTLASS_KERNELS_REGEX ".*(mx8mx8bf16_grouped|f4f4bf16_grouped|f4f4bf16).*") file(GLOB_RECURSE fbgemm_genai_native_cuda_cu "${FBGEMM_GENAI_SRCS}/cutlass_extensions/*.cu" "${FBGEMM_GENAI_SRCS}/cutlass_extensions/**/*.cu") diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index a354b41912406..6bc321887502d 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -23,8 +23,6 @@ C10_DIAGNOSTIC_POP() #endif namespace at { -namespace { - /* These const variables defined the fp32 precisions for different backend We have "generic", "cuda", "mkldnn" backend now and we can choose fp32 @@ -41,16 +39,6 @@ namespace { ->rnn */ - C10_ALWAYS_INLINE void warn_deprecated_fp32_precision_api(){ - TORCH_WARN_ONCE( - "Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' " - "or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, " - "torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see " - "https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices" - ); - } -} // namespace - Float32Backend str2backend(const std::string& name) { if (name == "generic") return Float32Backend::GENERIC; @@ -206,7 +194,6 @@ bool Context::allowTF32CuDNN(std::optional op) const { } else { return float32Precision(Float32Backend::CUDA, op.value()) == Float32Precision::TF32; } - warn_deprecated_fp32_precision_api(); return allow_tf32_cudnn; } @@ -214,7 +201,6 @@ void Context::setAllowTF32CuDNN(bool b) { setFloat32Precision(Float32Backend::CUDA, Float32Op::RNN, b ? Float32Precision::TF32 : Float32Precision::NONE); setFloat32Precision(Float32Backend::CUDA, Float32Op::CONV, b ? Float32Precision::TF32 : Float32Precision::NONE); allow_tf32_cudnn = b; - warn_deprecated_fp32_precision_api(); } void Context::setSDPPriorityOrder(const std::vector& order) { @@ -325,7 +311,6 @@ bool Context::allowTF32CuBLAS() const { "Current status indicate that you have used mix of the legacy and new APIs to set the TF32 status for cublas matmul. ", "We suggest only using the new API to set the TF32 flag. See also: ", "https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices"); - warn_deprecated_fp32_precision_api(); return allow_tf32_new; } @@ -349,7 +334,6 @@ Float32MatmulPrecision Context::float32MatmulPrecision() const { "Current status indicate that you have used mix of the legacy and new APIs to set the matmul precision. ", "We suggest only using the new API for matmul precision. 
See also: ", "https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices"); - warn_deprecated_fp32_precision_api(); return float32_matmul_precision; } @@ -377,7 +361,6 @@ Float32Precision Context::float32Precision(Float32Backend backend, Float32Op op) void Context::setFloat32MatmulPrecision(const std::string &s) { auto match = [this](const std::string & s_) { - warn_deprecated_fp32_precision_api(); // TODO: consider if CuDNN field needs to also be set for potential future CuDNN ops like multi-headed attention if (s_ == "highest") { float32_matmul_precision = at::Float32MatmulPrecision::HIGHEST; diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 6807e527eb75f..385ccb88c463b 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -174,6 +174,12 @@ class TORCH_API Context { static long versionCuDNN() { return detail::getCUDAHooks().versionCuDNN(); } + static long versionRuntimeCuDNN() { + return detail::getCUDAHooks().versionRuntimeCuDNN(); + } + static long versionCuDNNFrontend() { + return detail::getCUDAHooks().versionCuDNNFrontend(); + } static bool hasCuSOLVER() { return detail::getCUDAHooks().hasCuSOLVER(); } diff --git a/aten/src/ATen/DeviceAccelerator.h b/aten/src/ATen/DeviceAccelerator.h index f23b35047fcc8..2cc4cff7cd1f2 100644 --- a/aten/src/ATen/DeviceAccelerator.h +++ b/aten/src/ATen/DeviceAccelerator.h @@ -94,6 +94,11 @@ TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) { at::getDeviceAllocator(device_type)->resetPeakStats(device_index); } +TORCH_API inline std::pair getMemoryInfo( + c10::DeviceIndex device_index) { + const auto device_type = getAccelerator(true).value(); + return at::getDeviceAllocator(device_type)->getMemoryInfo(device_index); +} } // namespace at::accelerator namespace at { diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h index 40ad61cbd6455..870f7172d1622 100644 --- a/aten/src/ATen/Dispatch.h +++ b/aten/src/ATen/Dispatch.h @@ -6,6 +6,7 @@ #include #include #include +#include #ifdef __CUDACC__ #include // For CUDA_VERSION @@ -61,12 +62,9 @@ TORCH_API void record_kernel_function_dtype(std::string name); } \ } while (0) -#define AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, HINT, ...) \ - case enum_type: { \ - AT_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type); \ - using HINT [[maybe_unused]] = c10::impl::ScalarTypeToCPPTypeT; \ - return __VA_ARGS__(); \ - } +#define AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, HINT, ...) \ + THO_PRIVATE_CASE_TYPE_USING_HINT_TMPL( \ + AT_PRIVATE_CHECK_SELECTIVE_BUILD, enum_type, HINT, __VA_ARGS__) #define AT_DISPATCH_CASE(enum_type, ...) \ AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, scalar_t, __VA_ARGS__) @@ -95,14 +93,6 @@ TORCH_API void record_kernel_function_dtype(std::string name); return __VA_ARGS__(); \ } -namespace detail { - -inline at::ScalarType scalar_type(at::ScalarType s) { - return s; -} - -} // namespace detail - // The AT_DISPATCH_* family of macros provides the ability to // conveniently generate specializations of a kernel over all of the // dtypes we care about in PyTorch. We call it "dispatch" because @@ -190,27 +180,13 @@ inline at::ScalarType scalar_type(at::ScalarType s) { // but we're just being safe (and it doesn't hurt.) Note we must // use it to shut up warnings about unused store. -#define AT_DISPATCH_SWITCH(TYPE, NAME, ...) 
\ - [&] { \ - const auto& the_type = TYPE; \ - constexpr const char* at_dispatch_name = NAME; \ - /* don't use TYPE again in case it is an expensive or side-effect op */ \ - at::ScalarType _st = ::detail::scalar_type(the_type); \ - RECORD_KERNEL_FUNCTION_DTYPE(at_dispatch_name, _st); \ - C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum") \ - switch (_st) { \ - __VA_ARGS__ \ - default: \ - TORCH_CHECK_NOT_IMPLEMENTED( \ - false, \ - '"', \ - at_dispatch_name, \ - "\" not implemented for '", \ - toString(_st), \ - "'"); \ - } \ - C10_DIAGNOSTIC_POP() \ - }() +#define AT_DISPATCH_SWITCH(TYPE, NAME, ...) \ + THO_DISPATCH_SWITCH_TMPL( \ + RECORD_KERNEL_FUNCTION_DTYPE, \ + TORCH_CHECK_NOT_IMPLEMENTED, \ + TYPE, \ + NAME, \ + __VA_ARGS__) #define AT_DISPATCH_CASE_FLOATING_TYPES(...) \ AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \ diff --git a/aten/src/ATen/Dispatch_v2.h b/aten/src/ATen/Dispatch_v2.h index d0b77220faef2..fbeb48d45e32a 100644 --- a/aten/src/ATen/Dispatch_v2.h +++ b/aten/src/ATen/Dispatch_v2.h @@ -1,3 +1,8 @@ +#pragma once + +#include + +// Get AT_DISPATCH_SWITCH and AT_DISPATCH_CASE: #include // This is a new implementation of the AT_DISPATCH macro family from @@ -74,41 +79,19 @@ // macro expansion occurs, mediated with AT_EXPAND and AT_GUARD. I mostly // relied on GPT4 to help me get it right. -// Public API macros - // See documentation above #define AT_DISPATCH_V2(TYPE, NAME, BODY, ...) \ - AT_DISPATCH_SWITCH(TYPE, NAME, AT_AP_VAR(AT_WRAP(BODY), TYPE, __VA_ARGS__)) - -// This macro lets you pass an arbitrary expression that may contain internal -// commas to another macro without having the commas causing the expression -// to be interpreted as being multiple arguments -#define AT_WRAP(...) __VA_ARGS__ - -#define AT_FLOAT8_TYPES \ - c10::kFloat8_e5m2, c10::kFloat8_e5m2fnuz, c10::kFloat8_e4m3fn, \ - c10::kFloat8_e4m3fnuz, c10::kFloat8_e8m0fnu - -#define AT_INTEGRAL_TYPES \ - c10::kByte, c10::kChar, c10::kInt, c10::kLong, c10::kShort -#define AT_FLOATING_TYPES c10::kDouble, c10::kFloat -#define AT_BAREBONES_UNSIGNED_TYPES c10::kUInt16, c10::kUInt32, c10::kUInt64 -#define AT_INTEGRAL_TYPES_V2 \ - AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES) -#define AT_COMPLEX_TYPES c10::kComplexDouble, c10::kComplexFloat -#define AT_QINT_TYPES c10::kQInt8, c10::kQUInt8, c10::kQInt32 -// NB: not *actually* all types -#define AT_ALL_TYPES AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_FLOATING_TYPES) -#define AT_ALL_TYPES_AND_COMPLEX \ - AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_COMPLEX_TYPES) - -// Helper macros - + THO_DISPATCH_V2_TMPL( \ + AT_DISPATCH_SWITCH, \ + AT_DISPATCH_CASE, \ + TYPE, \ + NAME, \ + AT_WRAP(BODY), \ + __VA_ARGS__) + +// Unused helper macros, kept for BC: #define AT_AP_VAR(N, T, ...) \ AT_EXPAND(AT_CONCAT(AT_AP, AT_NUM_ARGS(__VA_ARGS__))(AT_WRAP(N), __VA_ARGS__)) -#define AT_CONCAT(a, b) AT_CONCAT_AUX(a, b) -#define AT_CONCAT_AUX(a, b) a##b -#define AT_EXPAND(X) X // Ensure we never have too many scalar types for the expansion here to // support. To bump this, you must regenerate the macros below. @@ -119,12 +102,6 @@ static_assert(static_cast(c10::ScalarType::NumOptions) < 60); num_args = 60 -nums = ', '.join(str(i) for i in reversed(range(num_args+1))) -args = ', '.join(f'_{i}' for i in range(1, num_args+1)) - -print(f'#define AT_NUM_ARGS(...) AT_EXPAND(AT_NUM_ARGS_AUX(__VA_ARGS__, {nums}))') -print(f'#define AT_NUM_ARGS_AUX({args}, N, ...) 
N') - for i in range(1, num_args+1): args = ', '.join(f'_{i}' for i in range(1, i+1)) cases = ' '.join([f'AT_DISPATCH_CASE(_{j}, N)' for j in range(1, i+1)]) @@ -135,8 +112,6 @@ for i in range(1, num_args+1): // Begin generated code // clang-format off -#define AT_NUM_ARGS(...) AT_EXPAND(AT_NUM_ARGS_AUX(__VA_ARGS__, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)) -#define AT_NUM_ARGS_AUX(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, _51, _52, _53, _54, _55, _56, _57, _58, _59, _60, N, ...) N #define AT_AP1(N, _1) AT_DISPATCH_CASE(_1, N) #define AT_AP2(N, _1, _2) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) #define AT_AP3(N, _1, _2, _3) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) diff --git a/aten/src/ATen/LegacyBatchedTensorImpl.h b/aten/src/ATen/LegacyBatchedTensorImpl.h index 798e3535af3fb..f051e7b1f6531 100644 --- a/aten/src/ATen/LegacyBatchedTensorImpl.h +++ b/aten/src/ATen/LegacyBatchedTensorImpl.h @@ -144,7 +144,7 @@ inline std::bitset createVmapLevelsBitset(BatchDimsRef bdims) { } inline std::ostream& operator<<(std::ostream& out, const BatchDim& bdim) { - out << "(lvl=" << bdim.level() << ", dim=" << bdim.dim() << ")"; + out << "(lvl=" << bdim.level() << ", dim=" << bdim.dim() << ')'; return out; } diff --git a/aten/src/ATen/TensorIndexing.cpp b/aten/src/ATen/TensorIndexing.cpp index 1fa852686656f..8618a67259c9c 100644 --- a/aten/src/ATen/TensorIndexing.cpp +++ b/aten/src/ATen/TensorIndexing.cpp @@ -9,7 +9,7 @@ namespace indexing { const EllipsisIndexType Ellipsis = EllipsisIndexType(); std::ostream& operator<<(std::ostream& stream, const Slice& slice) { - stream << slice.start() << ":" << slice.stop() << ":" << slice.step(); + stream << slice.start() << ':' << slice.stop() << ':' << slice.step(); return stream; } @@ -31,12 +31,12 @@ std::ostream& operator<<(std::ostream& stream, const TensorIndex& tensor_index) } std::ostream& operator<<(std::ostream& stream, const std::vector& tensor_indices) { - stream << "("; + stream << '('; for (const auto i : c10::irange(tensor_indices.size())) { stream << tensor_indices[i]; if (i < tensor_indices.size() - 1) stream << ", "; } - stream << ")"; + stream << ')'; return stream; } diff --git a/aten/src/ATen/TensorNames.cpp b/aten/src/ATen/TensorNames.cpp index bff12aa8de65f..ac6857b95c1d6 100644 --- a/aten/src/ATen/TensorNames.cpp +++ b/aten/src/ATen/TensorNames.cpp @@ -113,7 +113,7 @@ void TensorNames::checkUnique(const char* op_name) const { std::ostream& operator<<(std::ostream& out, const TensorName& tensorname) { out << tensorname.name_ << " (index "; out << tensorname.origin_idx_ << " of "; - out << tensorname.origin_ << ")"; + out << tensorname.origin_ << ')'; return out; } diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp index 8236751679f06..2752ff792e485 100644 --- a/aten/src/ATen/TensorUtils.cpp +++ b/aten/src/ATen/TensorUtils.cpp @@ -13,9 +13,9 @@ std::ostream& operator<<(std::ostream & out, const TensorGeometryArg& t) { if (t.pos == 0) { // 0 is distinguished; it usually indicates 'self' or the return // tensor - out << "'" << t.name << "'"; + out << '\'' << t.name << '\''; } else { - out << "argument 
#" << t.pos << " '" << t.name << "'"; + out << "argument #" << t.pos << " '" << t.name << '\''; } return out; } @@ -154,7 +154,7 @@ void checkSameGPU(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) { oss << "Tensor for " << t2 << " is on CPU, "; } oss << "but expected " << ((!t1->is_cpu() && !t2->is_cpu()) ? "them" : "it") - << " to be on GPU (while checking arguments for " << c << ")"; + << " to be on GPU (while checking arguments for " << c << ')'; TORCH_CHECK(false, oss.str()); } TORCH_CHECK( @@ -199,7 +199,7 @@ void checkScalarTypes(CheckedFrom c, const TensorArg& t, i++; } oss << "; but got " << t->toString() - << " instead (while checking arguments for " << c << ")"; + << " instead (while checking arguments for " << c << ')'; TORCH_CHECK(false, oss.str()); } } diff --git a/aten/src/ATen/Version.cpp b/aten/src/ATen/Version.cpp index 7239f357fdd64..a6335d9e11304 100644 --- a/aten/src/ATen/Version.cpp +++ b/aten/src/ATen/Version.cpp @@ -43,8 +43,8 @@ std::string get_mkldnn_version() { // https://github.com/intel/ideep/issues/29 { const dnnl_version_t* ver = dnnl_version(); - ss << "Intel(R) MKL-DNN v" << ver->major << "." << ver->minor << "." << ver->patch - << " (Git Hash " << ver->hash << ")"; + ss << "Intel(R) MKL-DNN v" << ver->major << '.' << ver->minor << '.' << ver->patch + << " (Git Hash " << ver->hash << ')'; } #else ss << "MKLDNN not found"; @@ -81,7 +81,7 @@ std::string get_openmp_version() { break; } if (ver_str) { - ss << " (a.k.a. OpenMP " << ver_str << ")"; + ss << " (a.k.a. OpenMP " << ver_str << ')'; } } #else @@ -135,38 +135,38 @@ std::string show_config() { #if defined(__GNUC__) { - ss << " - GCC " << __GNUC__ << "." << __GNUC_MINOR__ << "\n"; + ss << " - GCC " << __GNUC__ << '.' << __GNUC_MINOR__ << '\n'; } #endif #if defined(__cplusplus) { - ss << " - C++ Version: " << __cplusplus << "\n"; + ss << " - C++ Version: " << __cplusplus << '\n'; } #endif #if defined(__clang_major__) { - ss << " - clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__ << "\n"; + ss << " - clang " << __clang_major__ << '.' << __clang_minor__ << '.' << __clang_patchlevel__ << '\n'; } #endif #if defined(_MSC_VER) { - ss << " - MSVC " << _MSC_FULL_VER << "\n"; + ss << " - MSVC " << _MSC_FULL_VER << '\n'; } #endif #if AT_MKL_ENABLED() - ss << " - " << get_mkl_version() << "\n"; + ss << " - " << get_mkl_version() << '\n'; #endif #if AT_MKLDNN_ENABLED() - ss << " - " << get_mkldnn_version() << "\n"; + ss << " - " << get_mkldnn_version() << '\n'; #endif #ifdef _OPENMP - ss << " - " << get_openmp_version() << "\n"; + ss << " - " << get_openmp_version() << '\n'; #endif #if AT_BUILD_WITH_LAPACK() @@ -183,7 +183,7 @@ std::string show_config() { ss << " - Cross compiling on MacOSX\n"; #endif - ss << " - "<< used_cpu_capability() << "\n"; + ss << " - "<< used_cpu_capability() << '\n'; if (hasCUDA()) { ss << detail::getCUDAHooks().showConfig(); @@ -200,10 +200,10 @@ std::string show_config() { ss << " - Build settings: "; for (const auto& pair : caffe2::GetBuildOptions()) { if (!pair.second.empty()) { - ss << pair.first << "=" << pair.second << ", "; + ss << pair.first << '=' << pair.second << ", "; } } - ss << "\n"; + ss << '\n'; // TODO: do HIP // TODO: do XLA diff --git a/aten/src/ATen/code_template.h b/aten/src/ATen/code_template.h index 2026795fc0a3d..2cde802dac172 100644 --- a/aten/src/ATen/code_template.h +++ b/aten/src/ATen/code_template.h @@ -209,7 +209,7 @@ struct CodeTemplate { // to indent correctly in the context. 
void emitIndent(std::ostream& out, size_t indent) const { for ([[maybe_unused]] const auto i : c10::irange(indent)) { - out << " "; + out << ' '; } } void emitStringWithIndents( diff --git a/aten/src/ATen/core/CachingHostAllocator.h b/aten/src/ATen/core/CachingHostAllocator.h index 603e7e73bc1ea..71af40c5fd20a 100644 --- a/aten/src/ATen/core/CachingHostAllocator.h +++ b/aten/src/ATen/core/CachingHostAllocator.h @@ -226,8 +226,8 @@ template < typename B = HostBlock> struct CachingHostAllocatorImpl { virtual ~CachingHostAllocatorImpl() { - active_ = false; - if (pinned_use_background_threads()) { + if (active_) { + active_ = false; getBackgroundThreadPool()->waitWorkComplete(); } } @@ -260,6 +260,7 @@ struct CachingHostAllocatorImpl { if (pinned_use_background_threads()) { // Launch the background thread and process events in a loop. static bool background_thread_flag [[maybe_unused]] = [this] { + active_ = true; getBackgroundThreadPool()->run([&]() { while (active_) { process_events(); @@ -683,9 +684,9 @@ struct CachingHostAllocatorImpl { alignas(hardware_destructive_interference_size) std::mutex events_mutex_; std::deque> events_; // event queue paired with block - // Indicates whether the object is active. + // Indicates whether the event-processing thread pool is active. // Set to false in the destructor to signal background threads to stop. - std::atomic active_{true}; + std::atomic active_{false}; protected: alignas(hardware_destructive_interference_size) HostStatsStaged stats_; }; diff --git a/aten/src/ATen/core/Dimname.cpp b/aten/src/ATen/core/Dimname.cpp index c78d554732b9e..66aa8cb69e1ed 100644 --- a/aten/src/ATen/core/Dimname.cpp +++ b/aten/src/ATen/core/Dimname.cpp @@ -10,7 +10,7 @@ std::ostream& operator<<(std::ostream& out, const Dimname& dimname) { if (dimname.type() == NameType::WILDCARD) { out << "None"; } else { - out << "'" << dimname.symbol().toUnqualString() << "'"; + out << '\'' << dimname.symbol().toUnqualString() << '\''; } return out; } diff --git a/aten/src/ATen/core/Range.cpp b/aten/src/ATen/core/Range.cpp index 06a79a9c7d063..b5f4c7b6f85bc 100644 --- a/aten/src/ATen/core/Range.cpp +++ b/aten/src/ATen/core/Range.cpp @@ -5,7 +5,7 @@ namespace at { std::ostream& operator<<(std::ostream& out, const Range& range) { - out << "Range[" << range.begin << ", " << range.end << "]"; + out << "Range[" << range.begin << ", " << range.end << ']'; return out; } diff --git a/aten/src/ATen/core/Tensor.cpp b/aten/src/ATen/core/Tensor.cpp index c5f887f096cd1..090e77e703736 100644 --- a/aten/src/ATen/core/Tensor.cpp +++ b/aten/src/ATen/core/Tensor.cpp @@ -71,7 +71,7 @@ void TensorBase::enforce_invariants() { void TensorBase::print() const { if (defined()) { - std::cerr << "[" << toString() << " " << sizes() << "]" << '\n'; + std::cerr << '[' << toString() << ' ' << sizes() << ']' << '\n'; } else { std::cerr << "[UndefinedTensor]" << '\n'; } diff --git a/aten/src/ATen/core/TensorAccessor.h b/aten/src/ATen/core/TensorAccessor.h index 8cf57d2b646fe..d6421bcced0a8 100644 --- a/aten/src/ATen/core/TensorAccessor.h +++ b/aten/src/ATen/core/TensorAccessor.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -11,252 +12,37 @@ namespace at { -// The PtrTraits argument to the TensorAccessor/GenericPackedTensorAccessor -// is used to enable the __restrict__ keyword/modifier for the data -// passed to cuda. 
-template -struct DefaultPtrTraits { - typedef T* PtrType; -}; - +using torch::headeronly::DefaultPtrTraits; #if defined(__CUDACC__) || defined(__HIPCC__) -template -struct RestrictPtrTraits { - typedef T* __restrict__ PtrType; -}; + using torch::headeronly::RestrictPtrTraits; #endif -// TensorAccessorBase and TensorAccessor are used for both CPU and CUDA tensors. -// For CUDA tensors it is used in device code (only). This means that we restrict ourselves -// to functions and types available there (e.g. IntArrayRef isn't). - -// The PtrTraits argument is only relevant to cuda to support `__restrict__` pointers. -template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> -class TensorAccessorBase { -public: - typedef typename PtrTraits::PtrType PtrType; - - C10_HOST_DEVICE TensorAccessorBase( - PtrType data_, - const index_t* sizes_, - const index_t* strides_) - : data_(data_), sizes_(sizes_), strides_(strides_) {} - C10_HOST IntArrayRef sizes() const { - return IntArrayRef(sizes_,N); - } - C10_HOST IntArrayRef strides() const { - return IntArrayRef(strides_,N); - } - C10_HOST_DEVICE index_t stride(index_t i) const { - return strides_[i]; - } - C10_HOST_DEVICE index_t size(index_t i) const { - return sizes_[i]; - } - C10_HOST_DEVICE PtrType data() { - return data_; - } - C10_HOST_DEVICE const PtrType data() const { - return data_; - } -protected: - PtrType data_; - const index_t* sizes_; - const index_t* strides_; -}; - -// The `TensorAccessor` is typically instantiated for CPU `Tensor`s using -// `Tensor.accessor()`. -// For CUDA `Tensor`s, `GenericPackedTensorAccessor` is used on the host and only -// indexing on the device uses `TensorAccessor`s. template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> -class TensorAccessor : public TensorAccessorBase { -public: - typedef typename PtrTraits::PtrType PtrType; - - C10_HOST_DEVICE TensorAccessor( - PtrType data_, - const index_t* sizes_, - const index_t* strides_) - : TensorAccessorBase(data_,sizes_,strides_) {} - - C10_HOST_DEVICE TensorAccessor operator[](index_t i) { - return TensorAccessor(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1); - } - - C10_HOST_DEVICE const TensorAccessor operator[](index_t i) const { - return TensorAccessor(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1); - } -}; - -template class PtrTraits, typename index_t> -class TensorAccessor : public TensorAccessorBase { -public: - typedef typename PtrTraits::PtrType PtrType; - - C10_HOST_DEVICE TensorAccessor( - PtrType data_, - const index_t* sizes_, - const index_t* strides_) - : TensorAccessorBase(data_,sizes_,strides_) {} - C10_HOST_DEVICE T & operator[](index_t i) { - // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) - return this->data_[this->strides_[0]*i]; - } - C10_HOST_DEVICE const T & operator[](index_t i) const { - return this->data_[this->strides_[0]*i]; - } -}; +using TensorAccessorBase = torch::headeronly::detail::TensorAccessorBase; - -// GenericPackedTensorAccessorBase and GenericPackedTensorAccessor are used on for CUDA `Tensor`s on the host -// and as -// In contrast to `TensorAccessor`s, they copy the strides and sizes on instantiation (on the host) -// in order to transfer them on the device when calling kernels. -// On the device, indexing of multidimensional tensors gives to `TensorAccessor`s. -// Use RestrictPtrTraits as PtrTraits if you want the tensor's data pointer to be marked as __restrict__. 
-// Instantiation from data, sizes, strides is only needed on the host and std::copy isn't available -// on the device, so those functions are host only. template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> -class GenericPackedTensorAccessorBase { -public: - typedef typename PtrTraits::PtrType PtrType; - C10_HOST GenericPackedTensorAccessorBase( - PtrType data_, - const index_t* sizes_, - const index_t* strides_) - : data_(data_) { - std::copy(sizes_, sizes_ + N, std::begin(this->sizes_)); - std::copy(strides_, strides_ + N, std::begin(this->strides_)); - } +using TensorAccessor = torch::headeronly::detail::TensorAccessor; - // if index_t is not int64_t, we want to have an int64_t constructor - template >> - C10_HOST GenericPackedTensorAccessorBase( - PtrType data_, - const source_index_t* sizes_, - const source_index_t* strides_) - : data_(data_) { - for (const auto i : c10::irange(N)) { - this->sizes_[i] = sizes_[i]; - this->strides_[i] = strides_[i]; - } - } +namespace detail { - C10_HOST_DEVICE index_t stride(index_t i) const { - return strides_[i]; - } - C10_HOST_DEVICE index_t size(index_t i) const { - return sizes_[i]; - } - C10_HOST_DEVICE PtrType data() { - return data_; - } - C10_HOST_DEVICE const PtrType data() const { - return data_; - } -protected: - PtrType data_; - // NOLINTNEXTLINE(*c-arrays*) - index_t sizes_[N]; - // NOLINTNEXTLINE(*c-arrays*) - index_t strides_[N]; - C10_HOST void bounds_check_(index_t i) const { - TORCH_CHECK_INDEX( +template +struct IndexBoundsCheck { + IndexBoundsCheck(index_t i) { + TORCH_CHECK_INDEX( 0 <= i && i < index_t{N}, "Index ", i, " is not within bounds of a tensor of dimension ", N); - } + } }; +} // namespace detail template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> -class GenericPackedTensorAccessor : public GenericPackedTensorAccessorBase { -public: - typedef typename PtrTraits::PtrType PtrType; - - C10_HOST GenericPackedTensorAccessor( - PtrType data_, - const index_t* sizes_, - const index_t* strides_) - : GenericPackedTensorAccessorBase(data_, sizes_, strides_) {} - - // if index_t is not int64_t, we want to have an int64_t constructor - template >> - C10_HOST GenericPackedTensorAccessor( - PtrType data_, - const source_index_t* sizes_, - const source_index_t* strides_) - : GenericPackedTensorAccessorBase(data_, sizes_, strides_) {} - - C10_DEVICE TensorAccessor operator[](index_t i) { - index_t* new_sizes = this->sizes_ + 1; - index_t* new_strides = this->strides_ + 1; - return TensorAccessor(this->data_ + this->strides_[0]*i, new_sizes, new_strides); - } - - C10_DEVICE const TensorAccessor operator[](index_t i) const { - const index_t* new_sizes = this->sizes_ + 1; - const index_t* new_strides = this->strides_ + 1; - return TensorAccessor(this->data_ + this->strides_[0]*i, new_sizes, new_strides); - } - - /// Returns a PackedTensorAccessor of the same dimension after transposing the - /// two dimensions given. Does not actually move elements; transposition is - /// made by permuting the size/stride arrays. If the dimensions are not valid, - /// asserts. 
- C10_HOST GenericPackedTensorAccessor transpose( - index_t dim1, - index_t dim2) const { - this->bounds_check_(dim1); - this->bounds_check_(dim2); - GenericPackedTensorAccessor result( - this->data_, this->sizes_, this->strides_); - std::swap(result.strides_[dim1], result.strides_[dim2]); - std::swap(result.sizes_[dim1], result.sizes_[dim2]); - return result; - } -}; - -template class PtrTraits, typename index_t> -class GenericPackedTensorAccessor : public GenericPackedTensorAccessorBase { -public: - typedef typename PtrTraits::PtrType PtrType; - C10_HOST GenericPackedTensorAccessor( - PtrType data_, - const index_t* sizes_, - const index_t* strides_) - : GenericPackedTensorAccessorBase(data_, sizes_, strides_) {} - - // if index_t is not int64_t, we want to have an int64_t constructor - template >> - C10_HOST GenericPackedTensorAccessor( - PtrType data_, - const source_index_t* sizes_, - const source_index_t* strides_) - : GenericPackedTensorAccessorBase(data_, sizes_, strides_) {} - - C10_DEVICE T & operator[](index_t i) { - return this->data_[this->strides_[0] * i]; - } - C10_DEVICE const T& operator[](index_t i) const { - return this->data_[this->strides_[0]*i]; - } - - // Same as in the general N-dimensional case, but note that in the - // 1-dimensional case the returned PackedTensorAccessor will always be an - // identical copy of the original - C10_HOST GenericPackedTensorAccessor transpose( - index_t dim1, - index_t dim2) const { - this->bounds_check_(dim1); - this->bounds_check_(dim2); - return GenericPackedTensorAccessor( - this->data_, this->sizes_, this->strides_); - } -}; +using GenericPackedTensorAccessorBase = torch::headeronly::detail::GenericPackedTensorAccessorBase, T, N, PtrTraits, index_t>; +template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> +using GenericPackedTensorAccessor = torch::headeronly::detail::GenericPackedTensorAccessor, detail::IndexBoundsCheck, T, N, PtrTraits, index_t>; // Can't put this directly into the macro function args because of commas #define AT_X GenericPackedTensorAccessor diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h index 2b9558197bdcb..2d7ca10433d6a 100644 --- a/aten/src/ATen/core/TensorBase.h +++ b/aten/src/ATen/core/TensorBase.h @@ -245,6 +245,9 @@ class TORCH_API TensorBase { size_t weak_use_count() const noexcept { return impl_.weak_use_count(); } + bool is_uniquely_owned() const noexcept { + return impl_.is_uniquely_owned(); + } std::string toString() const; diff --git a/aten/src/ATen/core/Vitals.cpp b/aten/src/ATen/core/Vitals.cpp index 1cfc720aca52b..ac1ee45d58345 100644 --- a/aten/src/ATen/core/Vitals.cpp +++ b/aten/src/ATen/core/Vitals.cpp @@ -9,8 +9,8 @@ APIVitals VitalsAPI; std::ostream& operator<<(std::ostream& os, TorchVital const& tv) { for (const auto& m : tv.attrs) { - os << "[TORCH_VITAL] " << tv.name << "." << m.first << "\t\t " - << m.second.value << "\n"; + os << "[TORCH_VITAL] " << tv.name << '.' 
<< m.first << "\t\t " + << m.second.value << '\n'; } return os; } diff --git a/aten/src/ATen/core/alias_info.h b/aten/src/ATen/core/alias_info.h index bf0ff6ee72d3b..6a3335c328be2 100644 --- a/aten/src/ATen/core/alias_info.h +++ b/aten/src/ATen/core/alias_info.h @@ -100,18 +100,18 @@ inline bool operator==(const AliasInfo& lhs, const AliasInfo& rhs) { // this does match the way things are represented in the schema inline std::ostream& operator<<(std::ostream& out, const AliasInfo& aliasInfo) { - out << "("; + out << '('; bool first = true; for (const auto& set : aliasInfo.beforeSets()) { if (first) { first = false; } else { - out << "|"; + out << '|'; } out << set.toUnqualString(); } if (aliasInfo.isWrite()) { - out << "!"; + out << '!'; } if (aliasInfo.beforeSets() != aliasInfo.afterSets()) { out << " -> "; @@ -120,12 +120,12 @@ inline std::ostream& operator<<(std::ostream& out, const AliasInfo& aliasInfo) { if (first) { first = false; } else { - out << "|"; + out << '|'; } out << set.toUnqualString(); } } - out << ")"; + out << ')'; return out; } } // namespace c10 diff --git a/aten/src/ATen/core/blob.h b/aten/src/ATen/core/blob.h index 251da65e0896f..617d6a982ab4e 100644 --- a/aten/src/ATen/core/blob.h +++ b/aten/src/ATen/core/blob.h @@ -198,7 +198,7 @@ inline void swap(Blob& lhs, Blob& rhs) noexcept { } inline std::ostream& operator<<(std::ostream& out, const Blob& v) { - return out << "Blob[" << v.TypeName() << "]"; + return out << "Blob[" << v.TypeName() << ']'; } } // namespace caffe2 diff --git a/aten/src/ATen/core/class_type.cpp b/aten/src/ATen/core/class_type.cpp index 800d9ea0ef9f6..a65124e80979e 100644 --- a/aten/src/ATen/core/class_type.cpp +++ b/aten/src/ATen/core/class_type.cpp @@ -456,8 +456,8 @@ bool ClassType::isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const { *why_not << "Method on class '" << repr_str() << "' (1) is not compatible with interface '" << rhs.repr_str() << "' (2)\n" - << " (1) " << self_method->getSchema() << "\n" - << " (2) " << schema << "\n"; + << " (1) " << self_method->getSchema() << '\n' + << " (2) " << schema << '\n'; } return false; } diff --git a/aten/src/ATen/core/class_type.h b/aten/src/ATen/core/class_type.h index ea537400ef73d..f6f6bade9c90d 100644 --- a/aten/src/ATen/core/class_type.h +++ b/aten/src/ATen/core/class_type.h @@ -100,7 +100,7 @@ struct TORCH_API ClassType : public NamedType { std::string repr_str() const override { std::stringstream ss; ss << str() - << " (of Python compilation unit at: " << compilation_unit().get() << ")"; + << " (of Python compilation unit at: " << compilation_unit().get() << ')'; return ss.str(); } diff --git a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp index 9180d0d19e644..369bd374747ad 100644 --- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp +++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp @@ -58,12 +58,12 @@ std::string DispatchKeyExtractor::dumpState() const { std::ostringstream oss; for (const auto i : c10::irange(c10::utils::bitset::NUM_BITS())) { if (dispatch_arg_indices_reverse_.get(i)) { - oss << "1"; + oss << '1'; } else { - oss << "0"; + oss << '0'; } } - oss << " " << nonFallthroughKeys_ << "\n"; + oss << ' ' << nonFallthroughKeys_ << '\n'; return oss.str(); } diff --git a/aten/src/ATen/core/dispatch/Dispatcher.cpp b/aten/src/ATen/core/dispatch/Dispatcher.cpp index afcaf51f231ae..5facca30a54f3 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.cpp +++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp @@ -69,8 
+69,8 @@ class RegistrationListenerList final { void _print_dispatch_trace(const std::string& label, const std::string& op_name, const DispatchKeySet& dispatchKeySet) { auto nesting_value = dispatch_trace_nesting_value(); - for (int64_t i = 0; i < nesting_value; ++i) std::cerr << " "; - std::cerr << label << " op=[" << op_name << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << "]" << std::endl; + for (int64_t i = 0; i < nesting_value; ++i) std::cerr << ' '; + std::cerr << label << " op=[" << op_name << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << ']' << std::endl; } } // namespace detail diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index 928474ec3336d..e2627354971a0 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -570,7 +570,7 @@ void OperatorEntry::checkInvariants() const { std::string OperatorEntry::listAllDispatchKeys() const { std::ostringstream str; - str << "["; + str << '['; bool has_kernels = false; for (auto k : allDispatchKeysInFullSet()) { @@ -584,7 +584,7 @@ std::string OperatorEntry::listAllDispatchKeys() const { str << k; has_kernels = true; } - str << "]"; + str << ']'; return str.str(); } @@ -683,12 +683,12 @@ void OperatorEntry::setReportErrorCallback_(std::unique_ptr c // This WON'T report backend fallbacks. std::string OperatorEntry::dumpState() const { std::ostringstream oss; - oss << "name: " << name_ << "\n"; + oss << "name: " << name_ << '\n'; if (schema_) { - oss << "schema: " << schema_->schema << "\n"; - oss << "debug: " << schema_->debug << "\n"; + oss << "schema: " << schema_->schema << '\n'; + oss << "debug: " << schema_->debug << '\n'; oss << "alias analysis kind: " << toString(schema_->schema.aliasAnalysis()) - << (schema_->schema.isDefaultAliasAnalysisKind() ? " (default)" : "") << "\n"; + << (schema_->schema.isDefaultAliasAnalysisKind() ? " (default)" : "") << '\n'; } else { oss << "schema: (none)\n"; } diff --git a/aten/src/ATen/core/function_schema.cpp b/aten/src/ATen/core/function_schema.cpp index 6587af0f9ccc0..ffccbe282ddd2 100644 --- a/aten/src/ATen/core/function_schema.cpp +++ b/aten/src/ATen/core/function_schema.cpp @@ -7,7 +7,7 @@ namespace c10 { void FunctionSchema::dump() const { - std::cout << *this << "\n"; + std::cout << *this << '\n'; } const std::vector& FunctionSchema::getCorrectList(SchemaArgType type) const { @@ -210,9 +210,9 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) { out << schema.name(); if (!schema.overload_name().empty()) { - out << "." << schema.overload_name(); + out << '.' 
<< schema.overload_name(); } - out << "("; + out << '('; bool seen_kwarg_only = false; for (const auto i : c10::irange(schema.arguments().size())) { @@ -273,7 +273,7 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) { } if (need_paren) { - out << "("; + out << '('; } for (const auto i : c10::irange(returns.size())) { if (i > 0) { @@ -288,7 +288,7 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) { out << "..."; } if (need_paren) { - out << ")"; + out << ')'; } return out; } @@ -471,7 +471,7 @@ bool FunctionSchema::isForwardCompatibleWith( if (!arguments().at(i).isForwardCompatibleWith(old.arguments().at(i))) { if (why_not) { why_not - << "'" << arguments().at(i).name() << "'" + << '\'' << arguments().at(i).name() << '\'' << " is not forward compatible with the older version of the schema"; } return false; @@ -511,7 +511,7 @@ bool FunctionSchema::isForwardCompatibleWith( .isForwardCompatibleWith(old.arguments().at(i))) { if (why_not) { why_not << "Out argument '" - << "'" << arguments().at(i).name() + << '\'' << arguments().at(i).name() << " is not FC with the older version of the schema"; } return false; diff --git a/aten/src/ATen/core/function_schema.h b/aten/src/ATen/core/function_schema.h index c3e1520dc9868..f349567c26478 100644 --- a/aten/src/ATen/core/function_schema.h +++ b/aten/src/ATen/core/function_schema.h @@ -571,7 +571,7 @@ inline std::ostream& operator<<(std::ostream& out, const Argument& arg) { if (arg.N()) { N = std::to_string(*arg.N()); } - out << "[" << N << "]"; + out << '[' << N << ']'; } else { out << unopt_type->str(); } @@ -582,15 +582,15 @@ inline std::ostream& operator<<(std::ostream& out, const Argument& arg) { } if (is_opt) { - out << "?"; + out << '?'; } if (!arg.name().empty()) { - out << " " << arg.name(); + out << ' ' << arg.name(); } if (arg.default_value()) { - out << "="; + out << '='; if ((type->kind() == c10::TypeKind::StringType || unopt_type->kind() == c10::TypeKind::StringType) && arg.default_value().value().isString()) { diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 1ff8dd0410949..6e4ee82ab1137 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -66,7 +66,7 @@ bool operator==(const ivalue::Tuple& lhs, const ivalue::Tuple& rhs) { } std::ostream& operator<<(std::ostream& out, const ivalue::EnumHolder& v) { - out << v.qualifiedClassName() << "." << v.name(); + out << v.qualifiedClassName() << '.' 
<< v.name(); return out; } @@ -526,7 +526,7 @@ std::ostream& printMaybeAnnotatedList( !elementTypeCanBeInferredFromMembers(list_elem_type)) { out << "annotate(" << the_list.type()->annotation_str() << ", "; printList(out, the_list.toListRef(), "[", "]", formatter); - out << ")"; + out << ')'; return out; } else { return printList(out, the_list.toListRef(), "[", "]", formatter); @@ -538,7 +538,7 @@ std::ostream& printDict( std::ostream& out, const Dict& v, const IValueFormatter& formatter) { - out << "{"; + out << '{'; bool first = true; for (const auto& pair : v) { @@ -552,7 +552,7 @@ std::ostream& printDict( first = false; } - out << "}"; + out << '}'; return out; } } @@ -565,8 +565,8 @@ static std::ostream& printMaybeAnnotatedDict( auto value_type = the_dict.type()->castRaw()->getValueType(); if (the_dict.toGenericDict().empty() || !elementTypeCanBeInferredFromMembers(value_type)) { - out << "annotate(" << the_dict.type()->annotation_str() << ","; - printDict(out, the_dict.toGenericDict(), formatter) << ")"; + out << "annotate(" << the_dict.type()->annotation_str() << ','; + printDict(out, the_dict.toGenericDict(), formatter) << ')'; } else { return printDict(out, the_dict.toGenericDict(), formatter); } @@ -577,7 +577,7 @@ static std::ostream& printComplex(std::ostream & out, const IValue & v) { c10::complex d = v.toComplexDouble(); IValue real(d.real()), imag(std::abs(d.imag())); auto sign = d.imag() >= 0 ? '+' : '-'; - return out << real << sign << imag << "j"; + return out << real << sign << imag << 'j'; } std::ostream& IValue::repr( @@ -605,9 +605,9 @@ std::ostream& IValue::repr( if (static_cast(i) == d) { // -0.0 (signed zero) needs to be parsed as -0. if (i == 0 && std::signbit(d)) { - return out << "-" << i << "."; + return out << '-' << i << '.'; } - return out << i << "."; + return out << i << '.'; } } auto orig_prec = out.precision(); @@ -643,20 +643,20 @@ std::ostream& IValue::repr( device_stream << v.toDevice(); out << "torch.device("; c10::printQuotedString(out, device_stream.str()); - return out << ")"; + return out << ')'; } case IValue::Tag::Generator: { auto generator = v.toGenerator(); out << "torch.Generator(device="; c10::printQuotedString(out, generator.device().str()); - out << ", seed=" << generator.current_seed() << ")"; + out << ", seed=" << generator.current_seed() << ')'; return out; } case IValue::Tag::GenericDict: return printMaybeAnnotatedDict(out, v, formatter); case IValue::Tag::Enum: { auto enum_holder = v.toEnumHolder(); - return out << enum_holder->qualifiedClassName() << "." << + return out << enum_holder->qualifiedClassName() << '.' << enum_holder->name(); } case IValue::Tag::Object: { @@ -801,7 +801,7 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) { if (c == FP_NORMAL || c == FP_ZERO) { int64_t i = static_cast(d); if (static_cast(i) == d) { - return out << i << "."; + return out << i << '.'; } } auto orig_prec = out.precision(); @@ -852,7 +852,7 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) { return printDict(out, v.toGenericDict(), formatter); case IValue::Tag::PyObject: { auto py_obj = v.toPyObject(); - return out << ""; + return out << "'; } case IValue::Tag::Generator: return out << "Generator"; @@ -862,22 +862,22 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) { // TODO we should attempt to call __str__ if the object defines it. 
auto obj = v.toObject(); // print this out the way python would do it - return out << "<" << obj->name() << " object at " << obj.get() << ">"; + return out << '<' << obj->name() << " object at " << obj.get() << '>'; } case IValue::Tag::Enum: { auto enum_holder = v.toEnumHolder(); - return out << "Enum<" << enum_holder->unqualifiedClassName() << "." << - enum_holder->name() << ">"; + return out << "Enum<" << enum_holder->unqualifiedClassName() << '.' << + enum_holder->name() << '>'; } } - return out << ""; + return out << " ivalue::Object::type() const { @@ -1050,7 +1050,7 @@ c10::intrusive_ptr ivalue::Object::deepcopy( std::stringstream err; err << "Cannot serialize custom bound C++ class"; if (auto qualname = type()->name()) { - err << " " << qualname->qualifiedName(); + err << ' ' << qualname->qualifiedName(); } err << ". Please define serialization methods via def_pickle() for " "this class."; diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index f13b0613691b4..73aed03da073d 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -18,6 +18,8 @@ #include #include +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default") + namespace torch { class TORCH_API CustomClassHolder : public c10::intrusive_ptr_target {}; namespace jit { @@ -1630,4 +1632,6 @@ struct TORCH_API WeakOrStrongTypePtr { } // namespace c10 +C10_DIAGNOSTIC_POP() + #include // IWYU pragma: keep diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index 8d1c3aa83dadb..ac7540cffd18f 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -29,6 +29,8 @@ #include #include +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default") + namespace torch { namespace jit { struct Function; @@ -2567,3 +2569,5 @@ TypePtr IValue::type() const { } } // namespace c10 + +C10_DIAGNOSTIC_POP() diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index 666d1ade5789c..535831ea11d6e 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -211,7 +211,7 @@ struct TORCH_API OptionalType : public UnionType { std::string str() const override { std::stringstream ss; - ss << getElementType()->str() << "?"; + ss << getElementType()->str() << '?'; return ss.str(); } @@ -240,7 +240,7 @@ struct TORCH_API OptionalType : public UnionType { std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { std::stringstream ss; - ss << "Optional[" << getElementType()->annotation_str(printer) << "]"; + ss << "Optional[" << getElementType()->annotation_str(printer) << ']'; return ss.str(); } }; @@ -906,7 +906,7 @@ struct TORCH_API ListType std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { std::stringstream ss; - ss << "List[" << getElementType()->annotation_str(printer) << "]"; + ss << "List[" << getElementType()->annotation_str(printer) << ']'; return ss.str(); } }; @@ -946,7 +946,7 @@ struct TORCH_API DictType : public SharedType { std::string str() const override { std::stringstream ss; ss << "Dict(" << getKeyType()->str() << ", " << getValueType()->str() - << ")"; + << ')'; return ss.str(); } @@ -1018,7 +1018,7 @@ struct TORCH_API FutureType std::string str() const override { std::stringstream ss; - ss << "Future(" << getElementType()->str() << ")"; + ss << "Future(" << getElementType()->str() << ')'; return ss.str(); } TypePtr createWithContained( @@ -1041,7 +1041,7 @@ struct TORCH_API FutureType std::string annotation_str_impl(const 
TypePrinter& printer = nullptr) const override { std::stringstream ss; - ss << "Future[" << getElementType()->annotation_str(printer) << "]"; + ss << "Future[" << getElementType()->annotation_str(printer) << ']'; return ss.str(); } }; @@ -1060,7 +1060,7 @@ struct TORCH_API AwaitType std::string str() const override { std::stringstream ss; - ss << "Await(" << getElementType()->str() << ")"; + ss << "Await(" << getElementType()->str() << ')'; return ss.str(); } TypePtr createWithContained( @@ -1083,7 +1083,7 @@ struct TORCH_API AwaitType std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { std::stringstream ss; - ss << "Await[" << getElementType()->annotation_str(printer) << "]"; + ss << "Await[" << getElementType()->annotation_str(printer) << ']'; return ss.str(); } }; @@ -1102,7 +1102,7 @@ struct TORCH_API RRefType std::string str() const override { std::stringstream ss; - ss << "RRef(" << getElementType()->str() << ")"; + ss << "RRef(" << getElementType()->str() << ')'; return ss.str(); } TypePtr createWithContained( @@ -1115,7 +1115,7 @@ struct TORCH_API RRefType std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { std::stringstream ss; - ss << "RRef[" << getElementType()->annotation_str(printer) << "]"; + ss << "RRef[" << getElementType()->annotation_str(printer) << ']'; return ss.str(); } }; diff --git a/aten/src/ATen/core/operator_name.cpp b/aten/src/ATen/core/operator_name.cpp index 43a1fd24749a7..e55a84a4d305a 100644 --- a/aten/src/ATen/core/operator_name.cpp +++ b/aten/src/ATen/core/operator_name.cpp @@ -11,7 +11,7 @@ std::string toString(const OperatorName& opName) { std::ostream& operator<<(std::ostream& os, const OperatorName& opName) { os << opName.name; if (!opName.overload_name.empty()) { - os << "." << opName.overload_name; + os << '.' 
<< opName.overload_name; } return os; } diff --git a/aten/src/ATen/core/tensor_type.cpp b/aten/src/ATen/core/tensor_type.cpp index 9d8080cb8f317..d428aceb3d04c 100644 --- a/aten/src/ATen/core/tensor_type.cpp +++ b/aten/src/ATen/core/tensor_type.cpp @@ -65,7 +65,7 @@ VaryingShape VaryingShape::merge(const VaryingShape& other) const { template std::ostream& operator<<(std::ostream& out, const VaryingShape& vs) { - out << "("; + out << '('; if (!vs.size()) { out << "*)"; return out; @@ -79,10 +79,10 @@ std::ostream& operator<<(std::ostream& out, const VaryingShape& vs) { if (v.has_value()) { out << v.value(); } else { - out << "*"; + out << '*'; } } - out << ")"; + out << ')'; return out; } @@ -105,7 +105,7 @@ std::ostream& operator<<( } auto sizes_opt = ss.sizes(); - os << "("; + os << '('; for (size_t i = 0; i < rank_opt.value(); i++) { if (i > 0) { os << ", "; @@ -113,10 +113,10 @@ std::ostream& operator<<( if(sizes_opt.has_value() && sizes_opt.value()[i].is_static()) { os << sizes_opt.value()[i]; } else { - os << "*"; + os << '*'; } } - os << ")"; + os << ')'; return os; } @@ -131,17 +131,17 @@ std::ostream& operator<<(std::ostream& os, const ShapeSymbol& s) { } std::ostream& operator<<(std::ostream& os, const Stride& s) { - os << "{"; + os << '{'; if (s.stride_index_.has_value()) { os << *s.stride_index_; } else { - os << "*"; + os << '*'; } - os << ":"; + os << ':'; if (s.stride_.has_value()) { os << *s.stride_; } else { - os << "*"; + os << '*'; } os << '}'; return os; diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp index abba4e14583a3..46dc550b1f37b 100644 --- a/aten/src/ATen/core/type.cpp +++ b/aten/src/ATen/core/type.cpp @@ -67,7 +67,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { bool has_valid_strides_info = ndim > 0 && value->strides().isComplete() && value->strides().size() == ndim; - out << "("; + out << '('; size_t i = 0; bool symbolic = type_verbosity() == TypeVerbosity::Symbolic; for (i = 0; i < *ndim; ++i) { @@ -79,7 +79,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { } else if (symbolic) { out << value->symbolic_sizes().at(i); } else { - out << "*"; + out << '*'; } } if (has_valid_strides_info && @@ -91,7 +91,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { } out << value->strides()[i].value(); } - out << "]"; + out << ']'; } if (type_verbosity() >= TypeVerbosity::Full) { if (value->requiresGrad()) { @@ -107,12 +107,12 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { out << "device=" << *value->device(); } } - out << ")"; + out << ')'; } else { if (type_verbosity() >= TypeVerbosity::Full) { size_t i = 0; if (value->requiresGrad()) { - out << "(" + out << '(' << "requires_grad=" << *value->requiresGrad(); i++; } @@ -120,7 +120,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { out << ((i++ > 0) ? 
", " : "(") << "device=" << *value->device(); } if (i > 0) { - out << ")"; + out << ')'; } } } @@ -133,18 +133,18 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { out << *prim << "[]"; } else if (t.kind() == TypeKind::OptionalType) { auto prim = t.castRaw()->getElementType(); - out << *prim << "?"; + out << *prim << '?'; } else if(t.kind() == TypeKind::FutureType) { auto elem = t.castRaw()->getElementType(); - out << "Future[" << *elem << "]"; + out << "Future[" << *elem << ']'; } else if(t.kind() == TypeKind::RRefType) { auto elem = t.castRaw()->getElementType(); - out << "RRef[" << *elem << "]"; + out << "RRef[" << *elem << ']'; } else if(auto tup = t.cast()) { if (tup->schema()) { out << "NamedTuple"; } - out << "("; + out << '('; for(size_t i = 0; i < tup->elements().size(); ++i) { if(i > 0) out << ", "; @@ -160,7 +160,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { out << *(tup->elements()[i]); } } - out << ")"; + out << ')'; } else if (t.kind() == TypeKind::FunctionType) { out << "Function"; } else { @@ -475,7 +475,7 @@ std::optional unifyTypeList( why_not << "Could not unify type list since element " << i << " of type " << elements.at(i)->repr_str() << " did not match the types before it (" - << ret_type->repr_str() << ")"; + << ret_type->repr_str() << ')'; return std::nullopt; } ret_type = *maybe_unified; @@ -907,13 +907,13 @@ std::string TupleType::str() const { // NOLINTNEXTLINE(bugprone-unchecked-optional-access) ss << name()->qualifiedName(); } else { - ss << "("; + ss << '('; for(size_t i = 0; i < elements().size(); ++i) { if(i > 0) ss << ", "; ss << elements()[i]->str(); } - ss << ")"; + ss << ')'; } return ss.str(); } @@ -1003,8 +1003,8 @@ bool InterfaceType::isSubTypeImpl( *why_not << "Method on interface '" << lhs.repr_str() << "' (1) is not compatible with interface '" << rhs.repr_str() << "' (2)\n" - << " (1) " << *self_schema << "\n" - << " (2) " << schema << "\n"; + << " (1) " << *self_schema << '\n' + << " (2) " << schema << '\n'; return false; } return false; @@ -1078,7 +1078,7 @@ SymbolicShape SymbolicShape::merge(const SymbolicShape& other) const { } void SymbolicShape::dump() const { - std::cout << *this << "\n"; + std::cout << *this << '\n'; } bool EnumType::isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const { diff --git a/aten/src/ATen/core/union_type.cpp b/aten/src/ATen/core/union_type.cpp index dc4cb78872182..8731c2cbc4952 100644 --- a/aten/src/ATen/core/union_type.cpp +++ b/aten/src/ATen/core/union_type.cpp @@ -205,9 +205,9 @@ UnionType::UnionType(std::vector reference, TypeKind kind) : SharedType for (const auto i : c10::irange(reference.size())) { msg << reference[i]->repr_str(); if (i > 0) { - msg << ","; + msg << ','; } - msg << " "; + msg << ' '; } msg << "} has the single type " << types_[0]->repr_str() << ". 
Use the common supertype instead of creating a Union" diff --git a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h index 9e0b189bdac89..757ef839f965a 100644 --- a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h +++ b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h @@ -191,7 +191,7 @@ class Vectorized { auto vals = svreinterpret_u16_bf16(values); vals = sveor_u16_x(ptrue, vals, mask); return svreinterpret_bf16_u16(vals); - }; + } Vectorized round() const; Vectorized tan() const; Vectorized tanh() const; @@ -349,47 +349,47 @@ Vectorized inline Vectorized::frac() const { return convert_float_bfloat16(v1, v2); \ } -DEFINE_BF16_FUNC_VIA_FLOAT(isnan); -DEFINE_BF16_FUNC_VIA_FLOAT(angle); -DEFINE_BF16_FUNC_VIA_FLOAT(acos); -DEFINE_BF16_FUNC_VIA_FLOAT(acosh); -DEFINE_BF16_FUNC_VIA_FLOAT(asin); -DEFINE_BF16_FUNC_VIA_FLOAT(atan); -DEFINE_BF16_FUNC_VIA_FLOAT(atanh); -DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(atan2); -DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(copysign); -DEFINE_BF16_FUNC_VIA_FLOAT(erf); -DEFINE_BF16_FUNC_VIA_FLOAT(erfc); -DEFINE_BF16_FUNC_VIA_FLOAT(exp); -DEFINE_BF16_FUNC_VIA_FLOAT(exp2); -DEFINE_BF16_FUNC_VIA_FLOAT(expm1); -DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(fmod); -DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(hypot); -DEFINE_BF16_FUNC_VIA_FLOAT(i0); -DEFINE_BF16_FUNC_VIA_FLOAT(i0e); -DEFINE_BF16_FUNC_VIA_FLOAT(digamma); -DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(igamma); -DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(igammac); -DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(nextafter); -DEFINE_BF16_FUNC_VIA_FLOAT(log); -DEFINE_BF16_FUNC_VIA_FLOAT(log2); -DEFINE_BF16_FUNC_VIA_FLOAT(log10); -DEFINE_BF16_FUNC_VIA_FLOAT(log1p); -DEFINE_BF16_FUNC_VIA_FLOAT(sin); -DEFINE_BF16_FUNC_VIA_FLOAT(sinh); -DEFINE_BF16_FUNC_VIA_FLOAT(cos); -DEFINE_BF16_FUNC_VIA_FLOAT(cosh); -DEFINE_BF16_FUNC_VIA_FLOAT(ceil); -DEFINE_BF16_FUNC_VIA_FLOAT(floor); -DEFINE_BF16_FUNC_VIA_FLOAT(round); -DEFINE_BF16_FUNC_VIA_FLOAT(tan); -DEFINE_BF16_FUNC_VIA_FLOAT(tanh); -DEFINE_BF16_FUNC_VIA_FLOAT(trunc); -DEFINE_BF16_FUNC_VIA_FLOAT(lgamma); -DEFINE_BF16_FUNC_VIA_FLOAT(sqrt); -DEFINE_BF16_FUNC_VIA_FLOAT(reciprocal); -DEFINE_BF16_FUNC_VIA_FLOAT(rsqrt); -DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(pow); +DEFINE_BF16_FUNC_VIA_FLOAT(isnan) +DEFINE_BF16_FUNC_VIA_FLOAT(angle) +DEFINE_BF16_FUNC_VIA_FLOAT(acos) +DEFINE_BF16_FUNC_VIA_FLOAT(acosh) +DEFINE_BF16_FUNC_VIA_FLOAT(asin) +DEFINE_BF16_FUNC_VIA_FLOAT(atan) +DEFINE_BF16_FUNC_VIA_FLOAT(atanh) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(atan2) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(copysign) +DEFINE_BF16_FUNC_VIA_FLOAT(erf) +DEFINE_BF16_FUNC_VIA_FLOAT(erfc) +DEFINE_BF16_FUNC_VIA_FLOAT(exp) +DEFINE_BF16_FUNC_VIA_FLOAT(exp2) +DEFINE_BF16_FUNC_VIA_FLOAT(expm1) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(fmod) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(hypot) +DEFINE_BF16_FUNC_VIA_FLOAT(i0) +DEFINE_BF16_FUNC_VIA_FLOAT(i0e) +DEFINE_BF16_FUNC_VIA_FLOAT(digamma) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(igamma) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(igammac) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(nextafter) +DEFINE_BF16_FUNC_VIA_FLOAT(log) +DEFINE_BF16_FUNC_VIA_FLOAT(log2) +DEFINE_BF16_FUNC_VIA_FLOAT(log10) +DEFINE_BF16_FUNC_VIA_FLOAT(log1p) +DEFINE_BF16_FUNC_VIA_FLOAT(sin) +DEFINE_BF16_FUNC_VIA_FLOAT(sinh) +DEFINE_BF16_FUNC_VIA_FLOAT(cos) +DEFINE_BF16_FUNC_VIA_FLOAT(cosh) +DEFINE_BF16_FUNC_VIA_FLOAT(ceil) +DEFINE_BF16_FUNC_VIA_FLOAT(floor) +DEFINE_BF16_FUNC_VIA_FLOAT(round) +DEFINE_BF16_FUNC_VIA_FLOAT(tan) +DEFINE_BF16_FUNC_VIA_FLOAT(tanh) +DEFINE_BF16_FUNC_VIA_FLOAT(trunc) +DEFINE_BF16_FUNC_VIA_FLOAT(lgamma) +DEFINE_BF16_FUNC_VIA_FLOAT(sqrt) +DEFINE_BF16_FUNC_VIA_FLOAT(reciprocal) 
+DEFINE_BF16_FUNC_VIA_FLOAT(rsqrt) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(pow) Vectorized inline Vectorized::operator==( const Vectorized& other) const { diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_convert.h b/aten/src/ATen/cpu/vec/vec128/vec128_convert.h index e968389987fc5..060d60fa3e2d8 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128_convert.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128_convert.h @@ -223,6 +223,62 @@ CONVERT_FROM_BF16_TEMPLATE(double) CONVERT_FROM_BF16_TEMPLATE(float16_t) #endif +#ifdef __ARM_FEATURE_BF16 + +// clang-[17, 20] crashes when autovectorizing static cast to bf16 +// Below is a workaround to have some vectorization +// Works decently well for smaller int types +template +inline void convertToBf16Impl( + const from_type* __restrict src, + c10::BFloat16* __restrict dst, + uint64_t n) { + bfloat16_t* dstPtr = reinterpret_cast(dst); + uint64_t loopBound = n - (n % 16); + uint64_t i = 0; + for (; i < loopBound; i += 16) { + float32x4_t a, b, c, d; + a[0] = static_cast(src[i]); + a[1] = static_cast(src[i + 1]); + a[2] = static_cast(src[i + 2]); + a[3] = static_cast(src[i + 3]); + b[0] = static_cast(src[i + 4]); + b[1] = static_cast(src[i + 5]); + b[2] = static_cast(src[i + 6]); + b[3] = static_cast(src[i + 7]); + c[0] = static_cast(src[i + 8]); + c[1] = static_cast(src[i + 9]); + c[2] = static_cast(src[i + 10]); + c[3] = static_cast(src[i + 11]); + d[0] = static_cast(src[i + 12]); + d[1] = static_cast(src[i + 13]); + d[2] = static_cast(src[i + 14]); + d[3] = static_cast(src[i + 15]); + + vst1q_bf16(dstPtr + i, vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(a), b)); + vst1q_bf16(dstPtr + i + 8, vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(c), d)); + } + +#pragma clang loop vectorize(disable) interleave(disable) unroll(disable) + for (; i < n; i++) { + float a = static_cast(src[i]); + dstPtr[i] = vcvth_bf16_f32(a); + } +} + +#define CONVERT_TO_BF16_TEMPLATE(from_type) \ + template <> \ + inline void convert(const from_type* src, c10::BFloat16* dst, int64_t n) { \ + return convertToBf16Impl(src, dst, n); \ + } + +CONVERT_TO_BF16_TEMPLATE(uint8_t) +CONVERT_TO_BF16_TEMPLATE(int8_t) +CONVERT_TO_BF16_TEMPLATE(int16_t) +CONVERT_TO_BF16_TEMPLATE(int32_t) + +#endif + inline void convertBoolToBfloat16Impl( const bool* __restrict src, c10::BFloat16* __restrict dst, diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h b/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h index c479fc2e4aeb2..6a64226475cf3 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h @@ -11,6 +11,8 @@ #include #endif +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default") + // Sleef offers vectorized versions of some transcedentals // such as sin, cos, tan etc.. 
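The two bfloat16 hunks above share one theme: the SVE DEFINE_BF16_FUNC_VIA_FLOAT helpers implement bf16 math by widening to float32, computing there, and narrowing back, and the NEON int-to-bf16 conversion works around a clang 17-20 auto-vectorization crash by doing that float32 round trip by hand. A standalone scalar sketch of the widen/compute/narrow pattern, with hypothetical helper names and NaN handling omitted:

#include <cmath>
#include <cstdint>
#include <cstring>

// bf16 is the top 16 bits of an IEEE-754 float32.
inline float bf16_to_float(uint16_t b) {
  uint32_t bits = static_cast<uint32_t>(b) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

inline uint16_t float_to_bf16(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  bits += 0x7FFFu + ((bits >> 16) & 1u);  // round to nearest, ties to even
  return static_cast<uint16_t>(bits >> 16);
}

// "via float" pattern: widen, run the float routine, narrow back.
inline uint16_t bf16_sin(uint16_t x) {
  return float_to_bf16(std::sin(bf16_to_float(x)));
}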
// However for now opting for STL, since we are not building @@ -650,3 +652,5 @@ inline Vectorized Vectorized::erf() const { } // namespace CPU_CAPABILITY } // namespace at::vec + +C10_DIAGNOSTIC_POP() diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h index 50c3cc31a6c48..a2eb9e5f45104 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256.h @@ -80,7 +80,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { } stream << buf[i]; } - stream << "]"; + stream << ']'; return stream; } diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h index 975b71ce9a867..623971454df8b 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512.h @@ -55,7 +55,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { } stream << buf[i]; } - stream << "]"; + stream << ']'; return stream; } diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index aaed431064611..9a55b058001da 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -388,6 +388,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D #ifndef USE_ROCM at::Half halpha; at::Half hbeta; + uint32_t mask = -1; #endif void * alpha_ptr = α void * beta_ptr = β @@ -427,7 +428,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS(); if (fp16_reduction != at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { - uint32_t mask = + mask = fp16_reduction == at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK ? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | @@ -444,7 +445,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS(); if (bf16_reduction != at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { - uint32_t mask = + mask = bf16_reduction == at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK ? 
(CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | @@ -511,17 +512,41 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D cublasStatus_t cublasStatus = CUBLAS_STATUS_SUCCESS; cublasLtMatmulHeuristicResult_t heuristicResult = {}; int returnedResult = 0; - TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic( - ltHandle, - computeDesc.descriptor(), - Adesc.descriptor(), - Bdesc.descriptor(), - Cdesc.descriptor(), - Cdesc.descriptor(), - preference.descriptor(), - 1, - &heuristicResult, - &returnedResult)); + // on Blackwell+, we fake a n > 1 matmul when querying heuristics + // to prevent cuBLASLt from dispatching to a GEMV kernel for batch-invariance +#ifndef USE_ROCM + const bool lie_to_cublaslt = mask == CUBLASLT_REDUCTION_SCHEME_NONE && n == 1 && at::cuda::getCurrentDeviceProperties()->major >= 10; +#else + const bool lie_to_cublaslt = false; +#endif + if (lie_to_cublaslt) { + CuBlasLtMatrixLayout FakeBdesc(abType, k, 2, ldb, opb == CUBLAS_OP_T); + CuBlasLtMatrixLayout FakeCdesc(cType, m, 2, ldc); + + TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic( + ltHandle, + computeDesc.descriptor(), + Adesc.descriptor(), + FakeBdesc.descriptor(), + FakeCdesc.descriptor(), + FakeCdesc.descriptor(), + preference.descriptor(), + 1, + &heuristicResult, + &returnedResult)); + } else { + TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic( + ltHandle, + computeDesc.descriptor(), + Adesc.descriptor(), + Bdesc.descriptor(), + Cdesc.descriptor(), + Cdesc.descriptor(), + preference.descriptor(), + 1, + &heuristicResult, + &returnedResult)); + } if (returnedResult == 0) { cublasStatus = CUBLAS_STATUS_NOT_SUPPORTED; } @@ -1572,7 +1597,7 @@ bool gemm_and_bias( } using opmath_t = at::opmath_type; - opmath_t beta_val = 0; // bias is added in epilogue + opmath_t beta_val = bias ? 0 : 1; // bias is added in epilogue unless nullptr cudaDataType_t abType = CUDA_R_32F; cudaDataType_t cType = CUDA_R_32F; @@ -1661,15 +1686,22 @@ bool gemm_and_bias( _syncCurrentWithCarveoutStream(stream, true); } #endif - cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BIAS; - if (activation == GEMMAndBiasActivationEpilogue::RELU) { - epilogue = CUBLASLT_EPILOGUE_RELU_BIAS; - } else if (activation == GEMMAndBiasActivationEpilogue::GELU) { - epilogue = CUBLASLT_EPILOGUE_GELU_BIAS; - } + const auto epilogue = [&]() -> cublasLtEpilogue_t { + // The cuBLAS documentation indicates that + // *__BIAS = *_, + // but we keep it verbose here for clarity. + switch (activation) { + case GEMMAndBiasActivationEpilogue::RELU: + return bias ? CUBLASLT_EPILOGUE_RELU_BIAS : CUBLASLT_EPILOGUE_RELU; + case GEMMAndBiasActivationEpilogue::GELU: + return bias ? CUBLASLT_EPILOGUE_GELU_BIAS : CUBLASLT_EPILOGUE_GELU; + default: + return bias ? 
CUBLASLT_EPILOGUE_BIAS : CUBLASLT_EPILOGUE_DEFAULT; + } + }(); + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, epilogue); - if (bias != nullptr) { - computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, epilogue); + if (bias) { computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_POINTER, bias); } diff --git a/aten/src/ATen/cuda/CUDAContextLight.h b/aten/src/ATen/cuda/CUDAContextLight.h index 86e960cc1ab4a..01d10f61da692 100644 --- a/aten/src/ATen/cuda/CUDAContextLight.h +++ b/aten/src/ATen/cuda/CUDAContextLight.h @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -88,8 +89,13 @@ TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle(); TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle(); TORCH_CUDA_CPP_API void clearCublasWorkspaces(); -TORCH_CUDA_CPP_API std::map, at::DataPtr>& cublas_handle_stream_to_workspace(); -TORCH_CUDA_CPP_API std::map, at::DataPtr>& cublaslt_handle_stream_to_workspace(); +struct WorkspaceMapWithMutex { + std::map, at::DataPtr> map; + std::shared_mutex mutex; +}; + +TORCH_CUDA_CPP_API WorkspaceMapWithMutex& cublas_handle_stream_to_workspace(); +TORCH_CUDA_CPP_API WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace(); TORCH_CUDA_CPP_API size_t getChosenWorkspaceSize(); TORCH_CUDA_CPP_API size_t getCUDABlasLtWorkspaceSize(); TORCH_CUDA_CPP_API void* getCUDABlasLtWorkspace(); diff --git a/aten/src/ATen/cuda/CUDAGraph.cpp b/aten/src/ATen/cuda/CUDAGraph.cpp index 31d2d3f1fe589..1c0687dcd5fb7 100644 --- a/aten/src/ATen/cuda/CUDAGraph.cpp +++ b/aten/src/ATen/cuda/CUDAGraph.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -13,7 +14,7 @@ static bool _cuda_graphs_debug = false; MempoolId_t graph_pool_handle() { // Sets just the second value, to distinguish it from MempoolId_ts created from // cudaStreamGetCaptureInfo id_s in capture_begin. - return c10::cuda::MemPool::graph_pool_handle(); + return at::cuda::MemPool::graph_pool_handle(); } /** @@ -90,7 +91,7 @@ void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/, cudaStreamCaptureMode capt } else { // User did not ask us to share a mempool. Create graph pool handle using is_user_created=false. // Sets just the first value, to distinguish it from MempoolId_ts created by graph_pool_handle(). - mempool_id_ = c10::cuda::MemPool::graph_pool_handle(false); + mempool_id_ = at::cuda::MemPool::graph_pool_handle(false); TORCH_INTERNAL_ASSERT(mempool_id_.first > 0); } @@ -174,17 +175,24 @@ void CUDAGraph::instantiate() { // Trailing NULL, NULL, 0 arguments were recommended by Cuda driver people, // who prefer not to report error message through these arguments moving forward // (they prefer return value, or errors on api calls internal to the capture) -#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12000) - AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, 0)); + // ROCM appears to fail with HIP error: invalid argument +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12000) && !defined(USE_ROCM) + AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, cudaGraphInstantiateFlagUseNodePriority)); #else AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, NULL, NULL, 0)); #endif //Since ROCm 6.2, we want to go down this path as hipGraphExecDestroy in the destructor will not immediately free the memory. //It will wait for the next sync operation. cudaGraphInstantiateFlagAutoFreeOnLaunch will add async frees after graph launch. 
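Stepping back to the gemm_and_bias hunks a little earlier: beta is now 0 only when a bias pointer will be fused in the epilogue and 1 otherwise, and the epilogue enum is chosen per activation and bias presence. A naive single-threaded reference of that epilogue shape, purely to pin down the semantics (a sketch, not the cuBLASLt call; row-major, no transposes, RELU standing in for whichever activation the lambda selects):

#include <algorithm>
#include <cstddef>
#include <vector>

void gemm_bias_act_ref(const std::vector<float>& A, const std::vector<float>& B,
                       std::vector<float>& C, const float* bias,
                       std::size_t M, std::size_t N, std::size_t K,
                       float alpha = 1.0f) {
  const float beta = bias ? 0.0f : 1.0f;  // bias is added in the epilogue unless nullptr
  for (std::size_t i = 0; i < M; ++i) {
    for (std::size_t j = 0; j < N; ++j) {
      float acc = 0.0f;
      for (std::size_t k = 0; k < K; ++k) acc += A[i * K + k] * B[k * N + j];
      float out = alpha * acc + beta * C[i * N + j];
      if (bias) out += bias[j];            // epilogue bias, broadcast across rows
      C[i * N + j] = std::max(out, 0.0f);  // epilogue activation (RELU here)
    }
  }
}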
} else { +#if !defined(USE_ROCM) + AT_CUDA_CHECK(cudaGraphInstantiateWithFlags(&graph_exec_, + graph_, + cudaGraphInstantiateFlagAutoFreeOnLaunch | cudaGraphInstantiateFlagUseNodePriority)); +#else AT_CUDA_CHECK(cudaGraphInstantiateWithFlags(&graph_exec_, graph_, cudaGraphInstantiateFlagAutoFreeOnLaunch)); +#endif } has_graph_exec_ = true; } diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp index 6175e69827e2f..9ec3acf4cd29e 100644 --- a/aten/src/ATen/cuda/CublasHandlePool.cpp +++ b/aten/src/ATen/cuda/CublasHandlePool.cpp @@ -99,7 +99,7 @@ void destroyCublasHandle(cublasHandle_t handle) { // - Comments of @soumith copied from cuDNN handle pool implementation #ifdef NO_CUDNN_DESTROY_HANDLE #else - cublasDestroy(handle); + cublasDestroy(handle); #endif } @@ -107,19 +107,27 @@ using CuBlasPoolType = DeviceThreadHandlePool, at::DataPtr>& cublas_handle_stream_to_workspace() { - static auto& instance = *new std::map, at::DataPtr>; +WorkspaceMapWithMutex& cublas_handle_stream_to_workspace() { + static auto& instance = *new WorkspaceMapWithMutex; return instance; } -std::map, at::DataPtr>& cublaslt_handle_stream_to_workspace() { - static auto& instance = *new std::map, at::DataPtr>; +WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace() { + static auto& instance = *new WorkspaceMapWithMutex; return instance; } void clearCublasWorkspaces() { - cublas_handle_stream_to_workspace().clear(); - cublaslt_handle_stream_to_workspace().clear(); + { + auto& workspace = cublas_handle_stream_to_workspace(); + std::unique_lock lock(workspace.mutex); + workspace.map.clear(); + } + { + auto& workspace = cublaslt_handle_stream_to_workspace(); + std::unique_lock lock(workspace.mutex); + workspace.map.clear(); + } } size_t parseChosenWorkspaceSize() { @@ -233,6 +241,38 @@ at::DataPtr getNewCUDABlasLtWorkspace() { return c10::cuda::CUDACachingAllocator::get()->allocate(getCUDABlasLtWorkspaceSize()); } +void setWorkspaceForHandle(cublasHandle_t handle, c10::cuda::CUDAStream stream) { + cudaStream_t _stream = stream; + auto key = std::make_tuple(static_cast(handle), static_cast(_stream)); + + auto& workspace = cublas_handle_stream_to_workspace(); + + size_t workspace_size = getChosenWorkspaceSize(); + + // Fast path: check if workspace already exists + { + std::shared_lock lock(workspace.mutex); + auto workspace_it = workspace.map.find(key); + if (workspace_it != workspace.map.end()) { + TORCH_CUDABLAS_CHECK(cublasSetWorkspace( + handle, workspace_it->second.get(), workspace_size)); + return; + } + } + + // Slow path: allocate workspace outside the lock + auto new_workspace = getNewWorkspace(); + + // Insert with lock (double-check in case another thread inserted while we + // were allocating) + { + std::unique_lock lock(workspace.mutex); + auto workspace_it = workspace.map.try_emplace(key, std::move(new_workspace)).first; + TORCH_CUDABLAS_CHECK( + cublasSetWorkspace(handle, workspace_it->second.get(), workspace_size)); + } +} + void* getCUDABlasLtWorkspace() { #ifndef USE_ROCM static bool unified = c10::utils::check_env(TORCH_CUBLASLT_UNIFIED_WORKSPACE) == true; @@ -241,8 +281,10 @@ void* getCUDABlasLtWorkspace() { auto stream = c10::cuda::getCurrentCUDAStream(); cudaStream_t _stream = stream; auto key = std::make_tuple(static_cast(handle), static_cast(_stream)); - auto workspace_it = at::cuda::cublas_handle_stream_to_workspace().find(key); - TORCH_INTERNAL_ASSERT(workspace_it != at::cuda::cublas_handle_stream_to_workspace().end()); + auto& workspace = 
at::cuda::cublas_handle_stream_to_workspace(); + std::shared_lock lock(workspace.mutex); + auto workspace_it = workspace.map.find(key); + TORCH_INTERNAL_ASSERT(workspace_it != workspace.map.end()); return workspace_it->second.mutable_get(); } #endif @@ -250,11 +292,29 @@ void* getCUDABlasLtWorkspace() { auto stream = c10::cuda::getCurrentCUDAStream(); cudaStream_t _stream = stream; auto key = std::make_tuple(static_cast(handle), static_cast(_stream)); - auto workspace_it = cublaslt_handle_stream_to_workspace().find(key); - if (workspace_it == cublaslt_handle_stream_to_workspace().end()) { - workspace_it = cublaslt_handle_stream_to_workspace().insert(workspace_it, {key, getNewCUDABlasLtWorkspace()}); + + auto& workspace = cublaslt_handle_stream_to_workspace(); + + // Fast path: check if workspace already exists + { + std::shared_lock lock(workspace.mutex); + auto workspace_it = workspace.map.find(key); + if (workspace_it != workspace.map.end()) { + return workspace_it->second.mutable_get(); + } + } + + // Slow path: allocate workspace outside the lock + auto new_workspace = getNewCUDABlasLtWorkspace(); + + // Insert with lock (double-check in case another thread inserted while we + // were allocating) + { + std::unique_lock lock(workspace.mutex); + auto workspace_it = + workspace.map.try_emplace(key, std::move(new_workspace)).first; + return workspace_it->second.mutable_get(); } - return workspace_it->second.mutable_get(); } cublasHandle_t getCurrentCUDABlasHandle() { @@ -298,13 +358,8 @@ cublasHandle_t getCurrentCUDABlasHandle() { // will allocate memory dynamically (even if they're cheap) outside // PyTorch's CUDA caching allocator. It's possible that CCA used up // all the memory and cublas's cudaMallocAsync will return OOM - cudaStream_t _stream = stream; - auto key = std::make_tuple(static_cast(handle), static_cast(_stream)); - auto workspace_it = cublas_handle_stream_to_workspace().find(key); - if (workspace_it == cublas_handle_stream_to_workspace().end()) { - workspace_it = cublas_handle_stream_to_workspace().insert(workspace_it, {key, getNewWorkspace()}); - } - TORCH_CUDABLAS_CHECK(cublasSetWorkspace(handle, workspace_it->second.get(), getChosenWorkspaceSize())); + setWorkspaceForHandle(handle, stream); + #if !defined(USE_ROCM) // On CUDA >= 11, and architecture >= Ampere, cuBLAS can use TF32 to speedup // FP32 data type calculations based on the value of the allow_tf32 flag. diff --git a/aten/src/ATen/cuda/MemPool.cpp b/aten/src/ATen/cuda/MemPool.cpp new file mode 100644 index 0000000000000..99405965898e0 --- /dev/null +++ b/aten/src/ATen/cuda/MemPool.cpp @@ -0,0 +1,69 @@ +#include +#include + +namespace at::cuda { + +// uid_ is incremented when a user creates a MemPool, +// for example: using graph_pool_handle() or c10::cuda::MemPool(). +// +// uuid_ is incremented when CUDAGraph creates a MemPool +// as a result of a user not providing a pool. +// +// MempoolId_t of {0, 0} is used to denote when no MemPool has been +// passed to a function, either by user or CUDAGraphs. For example, +// default value of MempoolId_t for capture_begin function is {0, 0}. +// That's why uid_ and uuid_ start at 1. 
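The CublasHandlePool changes above guard the workspace maps with a read-mostly pattern: a shared_lock on the lookup fast path, allocation outside any lock, then a unique_lock plus try_emplace so a concurrent insertion wins and the spare allocation is simply dropped. A minimal standalone sketch of that pattern, with hypothetical types and names:

#include <map>
#include <memory>
#include <mutex>
#include <shared_mutex>

struct Cache {
  std::map<int, std::unique_ptr<int>> map;
  std::shared_mutex mutex;
};

int* get_or_create(Cache& cache, int key) {
  {
    std::shared_lock<std::shared_mutex> lock(cache.mutex);  // fast path: readers share the lock
    auto it = cache.map.find(key);
    if (it != cache.map.end()) {
      return it->second.get();
    }
  }
  auto fresh = std::make_unique<int>(key);  // slow path: "allocate" outside any lock
  std::unique_lock<std::shared_mutex> lock(cache.mutex);
  // try_emplace keeps the existing entry if another thread inserted first.
  auto it = cache.map.try_emplace(key, std::move(fresh)).first;
  return it->second.get();
}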
+std::atomic MemPool::uid_{1}; +std::atomic MemPool::uuid_{1}; + +MemPool::MemPool( + CUDACachingAllocator::CUDAAllocator* allocator, + bool is_user_created, + bool use_on_oom) + : allocator_(allocator), is_user_created_(is_user_created) { + if (is_user_created_) { + id_ = {0, uid_++}; + } else { + id_ = {uuid_++, 0}; + } + device_ = c10::cuda::current_device(); + CUDACachingAllocator::createOrIncrefPool(device_, id_, allocator); + if (use_on_oom) { + CUDACachingAllocator::setUseOnOOM(device_, id_); + } +} + +MemPool::~MemPool() { + // TORCH_INTERNAL_ASSERT(use_count() == 1); + // We used to assert that TORCH_INTERNAL_ASSERT(use_count() == 1); + // However, this assertion is not true if a memory pool is shared + // with a cuda graph. That CUDAGraph will increase the use count + // until it is reset. + CUDACachingAllocator::releasePool(device_, id_); + c10::cuda::CUDACachingAllocator::emptyCache(id_); +} + +MempoolId_t MemPool::id() { + return id_; +} + +CUDACachingAllocator::CUDAAllocator* MemPool::allocator() { + return allocator_; +} + +int MemPool::use_count() { + return CUDACachingAllocator::getPoolUseCount(device_, id_); +} + +c10::DeviceIndex MemPool::device() { + return device_; +} + +MempoolId_t MemPool::graph_pool_handle(bool is_user_created) { + if (is_user_created) { + return {0, uid_++}; + } + return {uuid_++, 0}; +} + +} // namespace at::cuda diff --git a/aten/src/ATen/cuda/MemPool.h b/aten/src/ATen/cuda/MemPool.h new file mode 100644 index 0000000000000..ba281c96b7043 --- /dev/null +++ b/aten/src/ATen/cuda/MemPool.h @@ -0,0 +1,44 @@ +#pragma once + +#include +#include + +namespace at::cuda { + +// Keep BC only +using c10::CaptureId_t; +using c10::MempoolId_t; + +// MemPool represents a pool of memory in a caching allocator. Currently, +// it's just the ID of the pool object maintained in the CUDACachingAllocator. +// +// An allocator pointer can be passed to the MemPool to define how the +// allocations should be done in the pool. For example: using a different +// system allocator such as ncclMemAlloc. +struct TORCH_CUDA_CPP_API MemPool { + MemPool( + c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator = nullptr, + bool is_user_created = true, + bool use_on_oom = false); + MemPool(const MemPool&) = delete; + MemPool(MemPool&&) = default; + MemPool& operator=(const MemPool&) = delete; + MemPool& operator=(MemPool&&) = default; + ~MemPool(); + + MempoolId_t id(); + c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator(); + int use_count(); + c10::DeviceIndex device(); + static MempoolId_t graph_pool_handle(bool is_user_created = true); + + private: + static std::atomic uid_; + static std::atomic uuid_; + c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator_; + bool is_user_created_; + MempoolId_t id_; + c10::DeviceIndex device_; +}; + +} // namespace at::cuda diff --git a/aten/src/ATen/cuda/cub.h b/aten/src/ATen/cuda/cub.h index 7430edaf8a3dc..bca9b1faff523 100644 --- a/aten/src/ATen/cuda/cub.h +++ b/aten/src/ATen/cuda/cub.h @@ -24,7 +24,13 @@ namespace detail { // radix_sort_pairs doesn't interact with value_t other than to copy // the data, so we can save template instantiations by reinterpreting // it as an opaque type. +// We use native integer types for 1/2/4/8-byte values to reduce +// register usage in CUDA kernels. For sizes > 8 fall back to char array. 
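The cub.h comment just above, together with the OpaqueType specializations that follow, relies on a small reinterpretation trick: radix_sort_pairs shuffles values as same-size opaque blobs, so only one instantiation per value size is needed, and 1/2/4/8-byte blobs are backed by a native integer instead of a char array. A standalone sketch of that idea, with illustrative names only:

#include <cstdint>
#include <cstring>
#include <type_traits>

template <int N> struct alignas(N) Blob { char data[N]; };  // generic fallback for larger sizes
template <> struct alignas(1) Blob<1> { uint8_t data; };
template <> struct alignas(2) Blob<2> { uint16_t data; };
template <> struct alignas(4) Blob<4> { uint32_t data; };   // fits a single register
template <> struct alignas(8) Blob<8> { uint64_t data; };

template <typename T>
Blob<sizeof(T)> as_blob(const T& v) {
  static_assert(std::is_trivially_copyable_v<T>, "only bit-copyable values");
  Blob<sizeof(T)> b;
  std::memcpy(&b, &v, sizeof(T));  // bit-copy; the sort never interprets the payload
  return b;
}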
template struct alignas(N) OpaqueType { char data[N]; }; +template <> struct alignas(1) OpaqueType<1> { uint8_t data; }; +template <> struct alignas(2) OpaqueType<2> { uint16_t data; }; +template <> struct alignas(4) OpaqueType<4> { uint32_t data; }; +template <> struct alignas(8) OpaqueType<8> { uint64_t data; }; template void radix_sort_pairs_impl( diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index b7f80101d926e..b2b9be4498e5b 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -21,6 +21,7 @@ #if AT_CUDNN_ENABLED() #include +#include #endif #if AT_MAGMA_ENABLED() @@ -351,6 +352,26 @@ long CUDAHooks::versionCuDNN() const { #endif } +long CUDAHooks::versionRuntimeCuDNN() const { +#if AT_CUDNN_ENABLED() +#ifndef USE_STATIC_CUDNN + return cudnnGetVersion(); +#else + return CUDNN_VERSION; +#endif +#else + TORCH_CHECK(false, "Cannot query CuDNN version if ATen_cuda is not built with CuDNN"); +#endif +} + +long CUDAHooks::versionCuDNNFrontend() const { +#if AT_CUDNN_ENABLED() + return CUDNN_FRONTEND_VERSION; +#else + TORCH_CHECK(false, "Cannot query CuDNN Frontend version if ATen_cuda is not built with CuDNN"); +#endif +} + long CUDAHooks::versionMIOpen() const { #if AT_ROCM_ENABLED() return MIOPEN_VERSION_MAJOR * 10000 + @@ -390,16 +411,16 @@ std::string CUDAHooks::showConfig() const { // HIP_VERSION value format was changed after ROCm v4.2 to include the patch number if(v < 500) { // If major=xx, minor=yy then format -> xxyy - oss << (v / 100) << "." << (v % 10); + oss << (v / 100) << '.' << (v % 10); } else { // If major=xx, minor=yy & patch=zzzzz then format -> xxyyzzzzz - oss << (v / 10000000) << "." << (v / 100000 % 100) << "." << (v % 100000); + oss << (v / 10000000) << '.' << (v / 100000 % 100) << '.' << (v % 100000); } #else - oss << (v / 1000) << "." << (v / 10 % 100); + oss << (v / 1000) << '.' << (v / 10 % 100); if (v % 10 != 0) { - oss << "." << (v % 10); + oss << '.' << (v % 10); } #endif }; @@ -410,16 +431,16 @@ std::string CUDAHooks::showConfig() const { oss << " - HIP Runtime "; #endif printCudaStyleVersion(runtimeVersion); - oss << "\n"; + oss << '\n'; // TODO: Make HIPIFY understand CUDART_VERSION macro #if !defined(USE_ROCM) if (runtimeVersion != CUDART_VERSION) { oss << " - Built with CUDA Runtime "; printCudaStyleVersion(CUDART_VERSION); - oss << "\n"; + oss << '\n'; } - oss << " - NVCC architecture flags: " << NVCC_FLAGS_EXTRA << "\n"; + oss << " - NVCC architecture flags: " << NVCC_FLAGS_EXTRA << '\n'; #endif #if !defined(USE_ROCM) @@ -427,9 +448,9 @@ std::string CUDAHooks::showConfig() const { auto printCudnnStyleVersion = [&](size_t v) { - oss << (v / 1000) << "." << (v / 100 % 10); + oss << (v / 1000) << '.' << (v / 100 % 10); if (v % 100 != 0) { - oss << "." << (v % 100); + oss << '.' << (v % 100); } }; @@ -440,22 +461,22 @@ std::string CUDAHooks::showConfig() const { if (cudnnCudartVersion != CUDART_VERSION) { oss << " (built against CUDA "; printCudaStyleVersion(cudnnCudartVersion); - oss << ")"; + oss << ')'; } - oss << "\n"; + oss << '\n'; if (cudnnVersion != CUDNN_VERSION) { oss << " - Built with CuDNN "; printCudnnStyleVersion(CUDNN_VERSION); - oss << "\n"; + oss << '\n'; } #endif #else // TODO: Check if miopen has the functions above and unify - oss << " - MIOpen " << MIOPEN_VERSION_MAJOR << "." << MIOPEN_VERSION_MINOR << "." << MIOPEN_VERSION_PATCH << "\n"; + oss << " - MIOpen " << MIOPEN_VERSION_MAJOR << '.' << MIOPEN_VERSION_MINOR << '.' 
<< MIOPEN_VERSION_PATCH << '\n'; #endif #if AT_MAGMA_ENABLED() - oss << " - Magma " << MAGMA_VERSION_MAJOR << "." << MAGMA_VERSION_MINOR << "." << MAGMA_VERSION_MICRO << "\n"; + oss << " - Magma " << MAGMA_VERSION_MAJOR << '.' << MAGMA_VERSION_MINOR << '.' << MAGMA_VERSION_MICRO << '\n'; #endif return oss.str(); diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index 8d3d1db003928..8902c68d342f8 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -49,6 +49,8 @@ struct CUDAHooks : public at::CUDAHooksInterface { bool hasCUDART() const override; long versionCUDART() const override; long versionCuDNN() const override; + long versionRuntimeCuDNN() const override; + long versionCuDNNFrontend() const override; long versionMIOpen() const override; std::string showConfig() const override; double batchnormMinEpsilonCuDNN() const override; diff --git a/aten/src/ATen/cuda/jiterator.cu b/aten/src/ATen/cuda/jiterator.cu index 3af5104288d21..d664c828bdad6 100644 --- a/aten/src/ATen/cuda/jiterator.cu +++ b/aten/src/ATen/cuda/jiterator.cu @@ -42,7 +42,7 @@ static inline void launch_jitted_vectorized_kernel_dynamic( // The cache key includes all the parameters to generate_code + vec_size + dev_idx std::stringstream ss; - ss << nInputs << "_" << nOutputs << f; + ss << nInputs << '_' << nOutputs << f; ss << f_inputs_type_str << compute_type_str << result_type_str; ss << static_cast(at::cuda::jit::BinaryFuncVariant::NoScalar); ss << extra_args_types; @@ -144,7 +144,7 @@ static inline void launch_jitted_unrolled_kernel_dynamic( // The cache key includes all the parameters to generate_code + dev_idx std::stringstream ss; - ss << nInputs << "_" << nOutputs << f; + ss << nInputs << '_' << nOutputs << f; ss << f_inputs_type_str << compute_type_str << result_type_str; ss << contiguous << dynamic_casting; ss << static_cast(at::cuda::jit::BinaryFuncVariant::NoScalar); diff --git a/aten/src/ATen/cuda/tunable/Tunable.cpp b/aten/src/ATen/cuda/tunable/Tunable.cpp index 9fb04b40d30f6..eb7e381d27766 100644 --- a/aten/src/ATen/cuda/tunable/Tunable.cpp +++ b/aten/src/ATen/cuda/tunable/Tunable.cpp @@ -52,10 +52,10 @@ TuningContext* getTuningContext() { std::ostream& operator<<(std::ostream& stream, const ResultEntry& entry) { static const bool blaslog = c10::utils::get_env("PYTORCH_TUNABLEOP_BLAS_LOG") == "1"; if (!blaslog) { - return stream << entry.key_ << "," << entry.time_; + return stream << entry.key_ << ',' << entry.time_; } else { - return stream << entry.key_ << "," << entry.time_ << ",BLAS_PARAMS: " << entry.blas_sig_; + return stream << entry.key_ << ',' << entry.time_ << ",BLAS_PARAMS: " << entry.blas_sig_; } } @@ -156,10 +156,10 @@ void TuningResultsManager::RecordUntuned( std::ofstream& untuned_file, const std if (isNew) { static const bool blaslog = c10::utils::get_env("PYTORCH_TUNABLEOP_BLAS_LOG") == "1"; if (!blaslog) { - untuned_file << op_signature << "," << params_signature << std::endl; + untuned_file << op_signature << ',' << params_signature << std::endl; } else { - untuned_file << op_signature << "," << params_signature << ",BLAS_PARAMS: " << blas_signature << std::endl; + untuned_file << op_signature << ',' << params_signature << ",BLAS_PARAMS: " << blas_signature << std::endl; } TUNABLE_LOG3("Untuned,", op_signature, ",", params_signature); } @@ -201,7 +201,7 @@ void TuningResultsManager::InitRealtimeAppend(const std::string& filename, const if(!file_exists || file_empty) { for(const auto& [key, val] : validators) 
{ - (*realtime_out_) << "Validator," << key << "," << val << std::endl; + (*realtime_out_) << "Validator," << key << ',' << val << std::endl; realtime_out_->flush(); } validators_written_ = true; @@ -219,7 +219,7 @@ void TuningResultsManager::AppendResultLine(const std::string& op_sig, const std return; } - (*realtime_out_) << op_sig << "," << param_sig << "," << result << std::endl; + (*realtime_out_) << op_sig << ',' << param_sig << ',' << result << std::endl; realtime_out_->flush(); //ensure immediate write to disk TUNABLE_LOG3("Realtime append: ", op_sig, "(", param_sig, ") -> ", result); diff --git a/aten/src/ATen/cudnn/Descriptors.cpp b/aten/src/ATen/cudnn/Descriptors.cpp index 8636d267209e9..a2cb0cb0a1025 100644 --- a/aten/src/ATen/cudnn/Descriptors.cpp +++ b/aten/src/ATen/cudnn/Descriptors.cpp @@ -93,31 +93,31 @@ std::string cudnnTypeToString(cudnnDataType_t dtype) { return "CUDNN_DATA_UINT8x4"; default: std::ostringstream oss; - oss << "(unknown data-type " << static_cast(dtype) << ")"; + oss << "(unknown data-type " << static_cast(dtype) << ')'; return oss.str(); } } std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) { - out << "TensorDescriptor " << static_cast(d.desc()) << "\n"; + out << "TensorDescriptor " << static_cast(d.desc()) << '\n'; int nbDims = 0; int dimA[CUDNN_DIM_MAX]; int strideA[CUDNN_DIM_MAX]; cudnnDataType_t dtype{}; cudnnGetTensorNdDescriptor(d.desc(), CUDNN_DIM_MAX, &dtype, &nbDims, dimA, strideA); - out << " type = " << cudnnTypeToString(dtype) << "\n"; - out << " nbDims = " << nbDims << "\n"; + out << " type = " << cudnnTypeToString(dtype) << '\n'; + out << " nbDims = " << nbDims << '\n'; // Read out only nbDims of the arrays! out << " dimA = "; for (auto i : ArrayRef{dimA, static_cast(nbDims)}) { out << i << ", "; } - out << "\n"; + out << '\n'; out << " strideA = "; for (auto i : ArrayRef{strideA, static_cast(nbDims)}) { out << i << ", "; } - out << "\n"; + out << '\n'; return out; } @@ -168,27 +168,27 @@ std::string cudnnMemoryFormatToString(cudnnTensorFormat_t tformat) { return "CUDNN_TENSOR_NHWC"; default: std::ostringstream oss; - oss << "(unknown cudnn tensor format " << static_cast(tformat) << ")"; + oss << "(unknown cudnn tensor format " << static_cast(tformat) << ')'; return oss.str(); } } std::ostream& operator<<(std::ostream & out, const FilterDescriptor& d) { - out << "FilterDescriptor " << static_cast(d.desc()) << "\n"; + out << "FilterDescriptor " << static_cast(d.desc()) << '\n'; int nbDims = 0; int dimA[CUDNN_DIM_MAX]; cudnnDataType_t dtype{}; cudnnTensorFormat_t tformat{}; cudnnGetFilterNdDescriptor(d.desc(), CUDNN_DIM_MAX, &dtype, &tformat, &nbDims, dimA); - out << " type = " << cudnnTypeToString(dtype) << "\n"; - out << " tensor_format = " << cudnnMemoryFormatToString(tformat) << "\n"; - out << " nbDims = " << nbDims << "\n"; + out << " type = " << cudnnTypeToString(dtype) << '\n'; + out << " tensor_format = " << cudnnMemoryFormatToString(tformat) << '\n'; + out << " nbDims = " << nbDims << '\n'; // Read out only nbDims of the arrays! 
out << " dimA = "; for (auto i : ArrayRef{dimA, static_cast(nbDims)}) { out << i << ", "; } - out << "\n"; + out << '\n'; return out; } diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index f1f2056917472..0ab8e82a30166 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -174,6 +174,14 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface { TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP); } + virtual long versionRuntimeCuDNN() const { + TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP); + } + + virtual long versionCuDNNFrontend() const { + TORCH_CHECK(false, "Cannot query cuDNN Frontend version without ATen_cuda library. ", CUDA_HELP); + } + virtual long versionMIOpen() const { TORCH_CHECK(false, "Cannot query MIOpen version without ATen_cuda library. ", CUDA_HELP); } diff --git a/aten/src/ATen/functorch/BatchedTensorImpl.h b/aten/src/ATen/functorch/BatchedTensorImpl.h index 985b289b3fe02..14be24d63e65a 100644 --- a/aten/src/ATen/functorch/BatchedTensorImpl.h +++ b/aten/src/ATen/functorch/BatchedTensorImpl.h @@ -157,6 +157,8 @@ constexpr DispatchKeySet kKeysToPropagateToWrapper({ DispatchKey::Negative, DispatchKey::Conjugate, DispatchKey::XLA, + DispatchKey::XPU, + DispatchKey::HPU, DispatchKey::CUDA, DispatchKey::CPU, DispatchKey::PrivateUse1, diff --git a/aten/src/ATen/functorch/DynamicLayer.cpp b/aten/src/ATen/functorch/DynamicLayer.cpp index 69af08a7bd7ce..518098a8b4a80 100644 --- a/aten/src/ATen/functorch/DynamicLayer.cpp +++ b/aten/src/ATen/functorch/DynamicLayer.cpp @@ -346,15 +346,15 @@ void foreachTensorInplaceWithFlag(std::vector& args, int64_t begin, int6 } std::ostream& operator<< (std::ostream& os, const DynamicLayer& layer) { - os << layer.layerId() << ":" << layer.key(); + os << layer.layerId() << ':' << layer.key(); return os; } std::ostream& operator<< (std::ostream& os, const std::vector& dls) { os << "DynamicLayerStack[ "; for (const auto& layer : dls) { - os << layer << " "; + os << layer << ' '; } - os << "]"; + os << ']'; return os; } diff --git a/aten/src/ATen/functorch/TensorWrapper.cpp b/aten/src/ATen/functorch/TensorWrapper.cpp index 65de9268927f0..ba5dcfc923878 100644 --- a/aten/src/ATen/functorch/TensorWrapper.cpp +++ b/aten/src/ATen/functorch/TensorWrapper.cpp @@ -22,7 +22,7 @@ void dumpTensor(std::ostream& ss, const Tensor& tensor) { if (batched) { ss << "Batched[lvl=" << batched->level() << " dim=" << batched->bdim() << ", "; dumpTensor(ss, batched->value()); - ss << "]"; + ss << ']'; return; } ss << "Tensor" << tensor.sizes(); @@ -36,7 +36,7 @@ void dumpTensor(std::ostream& ss, const Tensor& tensor) { ss << "dead, "; } dumpTensor(ss, wrapped->value()); - ss << "]"; + ss << ']'; } void TensorWrapper::refreshMetadata() { diff --git a/aten/src/ATen/miopen/Descriptors.cpp b/aten/src/ATen/miopen/Descriptors.cpp index 86e42ee3b66dc..3fe27c7a0825b 100644 --- a/aten/src/ATen/miopen/Descriptors.cpp +++ b/aten/src/ATen/miopen/Descriptors.cpp @@ -73,32 +73,32 @@ std::string miopenTypeToString(miopenDataType_t dtype) { return "miopenBFloat16"; default: std::ostringstream oss; - oss << "(unknown data-type " << static_cast(dtype) << ")"; + oss << "(unknown data-type " << static_cast(dtype) << ')'; return oss.str(); } } std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) { - out << "TensorDescriptor " << static_cast(d.desc()) << "\n"; + out << "TensorDescriptor 
" << static_cast(d.desc()) << '\n'; int nbDims = 0; int dimA[MIOPEN_DIM_MAX]; int strideA[MIOPEN_DIM_MAX]; miopenDataType_t dtype; miopenGetTensorDescriptorSize(d.desc(), &nbDims); miopenGetTensorDescriptor(d.desc(), &dtype, dimA, strideA); - out << " type = " << miopenTypeToString(dtype) << "\n"; - out << " nbDims = " << nbDims << "\n"; + out << " type = " << miopenTypeToString(dtype) << '\n'; + out << " nbDims = " << nbDims << '\n'; // Read out only nbDims of the arrays! out << " dimA = "; for (auto i : ArrayRef{dimA, static_cast(nbDims)}) { out << i << ", "; } - out << "\n"; + out << '\n'; out << " strideA = "; for (auto i : ArrayRef{strideA, static_cast(nbDims)}) { out << i << ", "; } - out << "\n"; + out << '\n'; return out; } diff --git a/aten/src/ATen/mps/MPSAllocator.mm b/aten/src/ATen/mps/MPSAllocator.mm index c8b3453fc81dd..dfdd67c8f4458 100644 --- a/aten/src/ATen/mps/MPSAllocator.mm +++ b/aten/src/ATen/mps/MPSAllocator.mm @@ -440,7 +440,7 @@ // we need to release the lock temporarily as synchronizing may cause deadlock with completion handlers. m_mutex.unlock(); auto stream = getDefaultMPSStream(); - dispatch_sync(stream->queue(), ^() { + dispatch_sync_with_rethrow(stream->queue(), ^() { stream->synchronize(SyncType::COMMIT_AND_WAIT); }); m_mutex.lock(); diff --git a/aten/src/ATen/mps/MPSProfiler.h b/aten/src/ATen/mps/MPSProfiler.h index c1cb9090fc4af..187e86d92e1bf 100644 --- a/aten/src/ATen/mps/MPSProfiler.h +++ b/aten/src/ATen/mps/MPSProfiler.h @@ -91,7 +91,7 @@ struct OperationInfo : BaseInfo { std::stringstream kernelStr; kernelStr << kernelName; for (const Tensor& tensor : tensors) { - kernelStr << ":" << BaseInfo::buildTensorString(tensor, includeBufferId); + kernelStr << ':' << BaseInfo::buildTensorString(tensor, includeBufferId); } return kernelStr.str(); } diff --git a/aten/src/ATen/mps/MPSProfiler.mm b/aten/src/ATen/mps/MPSProfiler.mm index a91574c56c52d..1d0408b8089c9 100644 --- a/aten/src/ATen/mps/MPSProfiler.mm +++ b/aten/src/ATen/mps/MPSProfiler.mm @@ -39,9 +39,9 @@ // see comments for INCLUDE_BUFFER_ID if (includeBufferId && deviceType == at::kMPS) { id buffer = __builtin_bit_cast(id, tensor.storage().data()); - tensorStr << "(buf#" << (getIMPSAllocator()->getBufferId(buffer)) << ":" << buffer.retainCount << ")"; + tensorStr << "(buf#" << (getIMPSAllocator()->getBufferId(buffer)) << ':' << buffer.retainCount << ')'; } - tensorStr << ":" << tensor.scalar_type() << tensor.sizes(); + tensorStr << ':' << tensor.scalar_type() << tensor.sizes(); return tensorStr.str(); } else { return "undefined"; diff --git a/aten/src/ATen/mps/MPSStream.h b/aten/src/ATen/mps/MPSStream.h index 10627cfc36b80..b00890b9f5901 100644 --- a/aten/src/ATen/mps/MPSStream.h +++ b/aten/src/ATen/mps/MPSStream.h @@ -110,6 +110,9 @@ class TORCH_API MPSStream { return _stream; } + MTLBuffer_t getErrorBuffer(); + void checkLastError(); + private: Stream _stream; MTLCommandQueue_t _commandQueue = nil; @@ -121,6 +124,8 @@ class TORCH_API MPSStream { dispatch_queue_t _serialQueue = nullptr; // CommitAndContinue is enabled by default bool _enableCommitAndContinue = true; + // Buffer that contains last raised error + MTLBuffer_t _errorBuffer = nil; // use synchronize() to access any of these commit functions outside MPSStream void commit(); @@ -155,4 +160,7 @@ class TORCH_API MPSStreamImpl { MPSStreamImpl(); }; +#ifdef __OBJC__ +void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)()); +#endif } // namespace at::mps diff --git a/aten/src/ATen/mps/MPSStream.mm 
b/aten/src/ATen/mps/MPSStream.mm index 595d71aeef15a..2150c21c18d75 100644 --- a/aten/src/ATen/mps/MPSStream.mm +++ b/aten/src/ATen/mps/MPSStream.mm @@ -3,13 +3,13 @@ #include #include #include +#include @interface MPSGraphExecutionDescriptor () @property(readwrite, atomic) BOOL enableCommitAndContinue; @end namespace at::mps { - //----------------------------------------------------------------- // MPSStream //----------------------------------------------------------------- @@ -30,6 +30,10 @@ @interface MPSGraphExecutionDescriptor () // Choose level which optimizes for GPU _compilationDescriptor.optimizationLevel = MPSGraphOptimizationLevel0; _executionDescriptor.compilationDescriptor = _compilationDescriptor; + + _errorBuffer = [MPSDevice::getInstance()->device() newBufferWithLength:sizeof(c10::metal::ErrorMessages) + options:MTLResourceStorageModeShared]; + std::memset([_errorBuffer contents], 0, 1024); } MPSStream::~MPSStream() { @@ -38,6 +42,8 @@ @interface MPSGraphExecutionDescriptor () [_executionDescriptor release]; [_compilationDescriptor release]; _executionDescriptor = nil; + [_errorBuffer release]; + _errorBuffer = nil; _compilationDescriptor = nil; assert(_commandBuffer == nil); @@ -104,6 +110,7 @@ @interface MPSGraphExecutionDescriptor () [_prevCommandBuffer waitUntilCompleted]; [_prevCommandBuffer release]; _prevCommandBuffer = nil; + checkLastError(); } if (_commandBuffer) { @@ -111,6 +118,7 @@ @interface MPSGraphExecutionDescriptor () [_commandBuffer waitUntilCompleted]; [_commandBuffer release]; _commandBuffer = nil; + checkLastError(); } } @@ -153,7 +161,7 @@ @interface MPSGraphExecutionDescriptor () if (length == 0) { return; } - dispatch_sync(_serialQueue, ^() { + dispatch_sync_with_rethrow(_serialQueue, ^() { @autoreleasepool { endKernelCoalescing(); id blitEncoder = [commandBuffer() blitCommandEncoder]; @@ -183,7 +191,7 @@ @interface MPSGraphExecutionDescriptor () size_t dstOffset, uint64_t profileId, SyncType syncType) { - dispatch_sync(_serialQueue, ^() { + dispatch_sync_with_rethrow(_serialQueue, ^() { @autoreleasepool { endKernelCoalescing(); id blitEncoder = [commandBuffer() blitCommandEncoder]; @@ -236,7 +244,7 @@ @interface MPSGraphExecutionDescriptor () auto& profiler = getMPSProfiler(); const bool isGraphProfilingEnabled = profiler.isOperationProfilingEnabled(); - dispatch_sync(_serialQueue, ^() { + dispatch_sync_with_rethrow(_serialQueue, ^() { endKernelCoalescing(); if (isGraphProfilingEnabled) { // this function call is only relevant for interval-based Signposts @@ -266,6 +274,24 @@ @interface MPSGraphExecutionDescriptor () }); } +id MPSStream::getErrorBuffer() { + return _errorBuffer; +} + +void MPSStream::checkLastError() { + auto msgs = reinterpret_cast([_errorBuffer contents]); + const auto& msg = msgs->msg[0]; + if (!msgs) { + return; + } + unsigned int count = 0; + std::swap(count, msgs->count); + if (!count) { + return; + } + throw c10::AcceleratorError({msg.func, msg.file, msg.line}, 1, msg.message); +} + //----------------------------------------------------------------- // MPSStreamImpl //----------------------------------------------------------------- @@ -289,4 +315,19 @@ @interface MPSGraphExecutionDescriptor () return MPSStreamImpl::getInstance(); } +// Helper methods +void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)()) { + __block std::optional block_exception; + dispatch_sync(queue, ^() { + try { + block(); + } catch (...) 
{ + block_exception = std::current_exception(); + } + }); + if (block_exception) { + std::rethrow_exception(*block_exception); + } +} + } // namespace at::mps diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index f5d5edb6439a6..2fa6bcc6dc9ac 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -1009,12 +1009,25 @@ static Device correct_out_device(const Tensor& self, const Tensor& other) { } } +static Tensor send_to_meta(const Tensor& self, const Device& device) { + Tensor out_meta; + if (self._is_zerotensor() && self.unsafeGetTensorImpl()->is_wrapped_number()) { + out_meta = at::_efficientzerotensor(self.sizes(), self.options().device(device)); + out_meta.unsafeGetTensorImpl()->set_wrapped_number(true); + } else { + out_meta = self.to(device); + } + return out_meta; +} + Tensor mul_zerotensor(const Tensor& self, const Tensor& other) { auto out_device = correct_out_device(self, other); // hack to use the TensorIterator to get the correct broadcasting and type promotion logic auto device_ = Device(DeviceType::Meta); constexpr c10::DispatchKeySet meta_dks(at::DispatchKey::Meta); - auto meta_out = at::_ops::mul_Tensor::redispatch(meta_dks, self.to(device_), other.to(device_)); + auto self_meta = send_to_meta(self, device_); + auto other_meta = send_to_meta(other, device_); + auto meta_out = at::_ops::mul_Tensor::redispatch(meta_dks, self_meta, other_meta); return at::_efficientzerotensor(meta_out.sizes(), meta_out.options().device(out_device)); } @@ -1023,7 +1036,9 @@ Tensor div_zerotensor(const Tensor& self, const Tensor& other) { // hack to use the TensorIterator to get the correct broadcasting and type promotion logic auto device_ = Device(DeviceType::Meta); constexpr c10::DispatchKeySet meta_dks(at::DispatchKey::Meta); - auto meta_out = at::_ops::div_Tensor::redispatch(meta_dks, self.to(device_), other.to(device_)); + auto self_meta = send_to_meta(self, device_); + auto other_meta = send_to_meta(other, device_); + auto meta_out = at::_ops::div_Tensor::redispatch(meta_dks, self_meta, other_meta); if (self._is_zerotensor()) { if (other._is_zerotensor()) { @@ -1052,8 +1067,9 @@ static Tensor maybe_add_maybe_sub(const Tensor& self, const Tensor& other, const // hack to use the TensorIterator to get the correct broadcasting and type promotion logic auto device_ = Device(DeviceType::Meta); constexpr c10::DispatchKeySet meta_dks(at::DispatchKey::Meta); - auto meta_out = at::_ops::add_Tensor::redispatch( - meta_dks, self.to(device_), other.to(device_), alpha); + auto self_meta = send_to_meta(self, device_); + auto other_meta = send_to_meta(other, device_); + auto meta_out = at::_ops::add_Tensor::redispatch(meta_dks, self_meta, other_meta, alpha); auto get_out_like = [&] (const Tensor& tensor) { diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h index 892144ac663a6..2a3388a052685 100644 --- a/aten/src/ATen/native/ConvUtils.h +++ b/aten/src/ATen/native/ConvUtils.h @@ -167,7 +167,7 @@ static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, co std::stringstream ss; ss << arg_name << " should be greater than zero but got ("; std::copy(args.begin(), args.end() - 1, std::ostream_iterator(ss,", ")); - ss << args.back() << ")" << " (while checking arguments for " << c << ")"; + ss << args.back() << ")" << " (while checking arguments for " << c << ')'; TORCH_CHECK(false, ss.str()); } } diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index 
2c3f14aab911c..cb37f6f1030d3 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -409,7 +409,7 @@ struct ConvParams { if (!detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda() || !cudnn_enabled) { return false; } - static long cudnn_version = detail::getCUDAHooks().versionCuDNN(); + static long cudnn_version = detail::getCUDAHooks().versionRuntimeCuDNN(); // broken on cuDNN 9.8 - 9.14 if (cudnn_version >= 90800 && cudnn_version < 91500) { if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous && @@ -453,7 +453,7 @@ struct ConvParams { } // native kernel doesn't support 64-bit non-splittable case if (!(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) { - static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionCuDNN() : -1; + static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionRuntimeCuDNN() : -1; // TODO(eqy): remove this once cuDNN fixes 64-bit depthwise support, first broken in 9.11x if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) { if (cudnn_version < 0 || cudnn_version > 91000) { @@ -639,7 +639,7 @@ static std::ostream& operator<<(std::ostream & out, const ConvParams& params) << " deterministic = " << params.deterministic << " cudnn_enabled = " << params.cudnn_enabled << " allow_tf32 = " << params.allow_tf32 - << "}"; + << '}'; return out; } diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index 1da245972f0cb..fbabba84dbb2d 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -50,18 +50,35 @@ static inline bool parseLinearFlatten3d() { // `_flatten_nd_linear` flattens all but the last dimension of the input tensor // before passing it to linear operation static inline Tensor _flatten_nd_linear(const Tensor& input, const Tensor& weight, const Tensor& bias) { - const auto input_sizes = input.sym_sizes(); - // can't use -1 in reshape because it errors when a dimension is 0 - c10::SymInt flattened_dim = 1; - for (int64_t i = 0, ndim = input_sizes.size(); i < ndim - 1; ++i) { - flattened_dim = flattened_dim * input_sizes[i]; + const auto input_sizes = input.sym_sizes(); + + const auto result_flattened = [&]() -> Tensor { + const auto input_ncols = input_sizes.back(); + const auto input_flattened_nrows = [&]() -> c10::SymInt { + // can't use -1 in reshape because it errors when a dimension is 0 + auto flattened_nrows = c10::SymInt{1}; + for (const auto& size : input_sizes.slice(0, input_sizes.size() - 1)) { + flattened_nrows *= size; + } + return flattened_nrows; + }(); + + const auto input_flattened = input.view_symint({input_flattened_nrows, input_ncols}); + if (weight.layout() == c10::kStrided) { + return at::addmm(bias, input_flattened, weight.t()); + } else { + // weight is sparse, and addmm for sparse expects matmul lhs to be sparse, + // so we transpose the problem. + // NOTE: at::matmul handles (dense @ sparse) similarly. + const auto bias_t = (bias.dim() >= 2) ? 
bias.mT() : bias.unsqueeze(-1); + return at::addmm(bias_t, weight, input_flattened.t()).t(); } - auto inp_reshape = input.reshape_symint({flattened_dim, input_sizes.at(input_sizes.size() -1)}); - const auto result = at::addmm(bias, inp_reshape, weight.t()); - auto new_size = input_sizes.slice(0, input_sizes.size() - 1); - c10::SymDimVector sizes_vec(new_size.begin(), new_size.end()); - sizes_vec.push_back(result.sym_size(1)); - return result.view_symint(sizes_vec); + }(); + + // Unflatten flattened row dims + auto result_sizes = c10::SymDimVector{input_sizes.begin(), input_sizes.end()}; + result_sizes.back() = result_flattened.sym_size(1); + return result_flattened.view_symint(result_sizes); } @@ -90,15 +107,23 @@ Tensor linear(const Tensor& input, const Tensor& weight, const std::optionaldefined() && !input.is_xla()) { - // Also hit the fused path for contiguous 3D input, if not using xla + + const auto is_bias_likely_fusable = ( + bias->defined() && + // cuBLASLt: will fuse in the epilogue without copies + // when input/weight/bias are all strided. + // When weight is not strided, bias will not be fused, + // but we can still dispatch here to avoid at::matmul + // path which will probably use a very similar + // flattening optimization. + ((bias->dim() == 1 || bias->squeeze().dim() == 1) && bias->is_contiguous_or_false()) + ); + if (is_bias_likely_fusable && !input.is_xla()) { + // Also hit the fused path for contiguous nD input, if not using xla // backend. Reshaping/flattening has some performance implications on xla. - bool is_contiguous = input.is_contiguous_or_false(); - if (is_contiguous && input_dim == 3) { - return _flatten_nd_linear(input, weight, *bias); - } else if (is_contiguous && input.layout() == c10::kStrided && weight.layout() == c10::kStrided && bias->dim() == 1) { + if (input.is_contiguous_or_false()) { return _flatten_nd_linear(input, weight, *bias); - } else if (parseLinearFlatten3d() && input_dim == 3) { + } else if (parseLinearFlatten3d()) { // If user forces flattening via env var const Tensor input_cont = input.contiguous(); return _flatten_nd_linear(input_cont, weight, *bias); diff --git a/aten/src/ATen/native/PackedSequence.cpp b/aten/src/ATen/native/PackedSequence.cpp index d069108348d24..be7961b2a2452 100644 --- a/aten/src/ATen/native/PackedSequence.cpp +++ b/aten/src/ATen/native/PackedSequence.cpp @@ -142,6 +142,7 @@ Tensor _pack_padded_sequence_backward_symint(const Tensor& grad, c10::SymIntArra std::tuple _pad_packed_sequence(const Tensor& data, const Tensor& _batch_sizes, bool batch_first, const Scalar& padding_value, int64_t total_length) { auto batch_sizes_t = _batch_sizes.contiguous(); checkLongTensor(batch_sizes_t); + TORCH_CHECK(batch_sizes_t.numel() > 0, "batch_sizes can not be empty"); int64_t * batch_sizes = batch_sizes_t.data_ptr(); int64_t max_batch_size = batch_sizes[0]; diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index 79aaac48034ac..975e237c468d6 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -847,7 +847,7 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const std::optional> indices; indices.resize(dim + 1); indices.set(dim, index); diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index c6126eda61e73..8a0b38eafab36 100644 --- a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -479,6 
+480,14 @@ Tensor isfinite(const Tensor& self) { }); } +void _async_error(std::string_view msg) { + TORCH_CHECK(0, msg); +} + +void _async_error_meta(std::string_view msg) { + // Do NOT error, it's an async error! +} + void _assert_async_cpu(const Tensor& self) { TORCH_CHECK( native::is_nonzero(self), @@ -514,7 +523,7 @@ Tensor _functional_assert_async_msg_cpu( } void _print(std::string_view s) { - std::cout << s << "\n"; + std::cout << s << '\n'; } // Sorting-based algorithm for isin(); used when the number of test elements is diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index 6df7761d822db..0079a530b3d0e 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -1,5 +1,6 @@ #include #include +#include #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include @@ -1710,11 +1711,37 @@ Tensor narrow_symint( "], but got ", start, ")") - if (start < 0) { - start = start + cur_size; - } + + auto cond1 = TORCH_GUARD_OR_FALSE(start.sym_lt(0)); + auto cond2 = TORCH_GUARD_OR_FALSE(start.sym_ge(0)); + + if (cond1 || cond2) { + if (cond1) { + start = start + cur_size; + } + + TORCH_SYM_CHECK( + start.sym_le(cur_size - length), + "start (", + start, + ") + length (", + length, + ") exceeds dimension size (", + cur_size, + ")."); + return at::slice_symint(self, dim, start, start + length, 1); + } + + // Unbacked start handling! + + // Bounds check without converting start: + // - If start < 0: need (start + cur_size) + length <= cur_size, i.e., start + + // length <= 0 + // - If start >= 0: need start + length <= cur_size + auto end = start + length; TORCH_SYM_CHECK( - start.sym_le(cur_size - length), + (start.sym_lt(0).sym_and((end).sym_le(0))) + .sym_or(start.sym_ge(0).sym_and((end).sym_le(cur_size))), "start (", start, ") + length (", @@ -1722,7 +1749,28 @@ Tensor narrow_symint( ") exceeds dimension size (", cur_size, ")."); - return at::slice_symint(self, dim, start, start + length, 1); + + if (TORCH_GUARD_OR_FALSE(end.sym_ne(0))) { + return at::slice_symint(self, dim, start, end, 1); + } else { + // Cannot statically determine the condition due to unbacked. + // This is an interesting situation; when start is negative and + // start + length == 0, slice and narrow do different things. + // i.e., x.narrow(0, -2, 2) != x[-2:0]; in that case, we want to + // pass curr_size instead of 0. Otherwise, they would do the same thing. + // This says at runtime: if start < 0 and end == 0, then pass curr_size + // instead of 0. + + auto use_different = start.sym_lt(0).sym_and(end.sym_eq(0)).toSymInt(); + auto result = + at::slice_symint(self, dim, start, end + use_different * cur_size, 1); + + // Ensure slice allocated unbacked size is specialized to length. 
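A minimal standalone sketch (not part of the patch; assumes a stock libtorch build) of the start + length == 0 corner case this unbacked branch guards against: narrow with a negative start keeps the trailing rows, while a literal slice whose end stays 0 is empty, which is why the code above substitutes cur_size for the end in that situation.

```cpp
// Illustrative only: shows why narrow(0, -2, 2) cannot be lowered to slice(0, -2, 0).
#include <ATen/ATen.h>
#include <iostream>

int main() {
  at::Tensor x = at::arange(6).reshape({3, 2});
  auto a = at::narrow(x, /*dim=*/0, /*start=*/-2, /*length=*/2);  // last two rows
  auto b = at::slice(x, /*dim=*/0, /*start=*/-2, /*end=*/0);      // empty: end 0 is not remapped
  std::cout << a.sizes() << " vs " << b.sizes() << '\n';          // [2, 2] vs [0, 2]
}
```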
+ SymInt new_size = result.sym_size(dim); + TORCH_SYM_CHECK(new_size.sym_eq(length), "") + + return result; + } } // This overload exists purely for XLA, because they wanted to pass in @@ -1736,8 +1784,8 @@ Tensor narrow_tensor_symint( start.dim() == 0 && isIntegralType(start.scalar_type(), /*includeBool=*/false), "start must be an 0-dim integral Tensor."); - int64_t st = start.item(); - return at::narrow_symint(self, dim, c10::SymInt(st), std::move(length)); + c10::SymInt st = start.item().toSymInt(); + return at::narrow_symint(self, dim, std::move(st), std::move(length)); } std:: diff --git a/aten/src/ATen/native/TransposeType.h b/aten/src/ATen/native/TransposeType.h index 603bf6fee60aa..bb63e6d542482 100644 --- a/aten/src/ATen/native/TransposeType.h +++ b/aten/src/ATen/native/TransposeType.h @@ -1,6 +1,8 @@ #pragma once #include +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default") + namespace at::native { // Used as an interface between the different BLAS-like libraries @@ -21,3 +23,5 @@ static inline char to_blas(TransposeType trans) { } } // namespace at::native + +C10_DIAGNOSTIC_POP() diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index f849283043d37..acf14f3dfcdd5 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -904,19 +904,11 @@ Tensor mvlgamma(const Tensor& self, int64_t p) { return args.lgamma_().sum(-1).add_(p2_sub_p * std::log(c10::pi) * QUARTER); } +// since mvlgamma_ has different signature from its +// out and functional variant, we explicitly +// define it (instead of using structured kernel). Tensor& mvlgamma_(Tensor& self, int64_t p) { - mvlgamma_check(self, p); - Tensor args = native::arange( - -p *HALF + HALF, - HALF, - HALF, - optTypeMetaToScalarType(self.options().dtype_opt()), - self.options().layout_opt(), - self.options().device_opt(), - self.options().pinned_memory_opt()); - args = args.add(self.unsqueeze(-1)); - const auto p2_sub_p = static_cast(p * (p - 1)); - return self.copy_(args.lgamma_().sum(-1).add_(p2_sub_p * std::log(c10::pi) * QUARTER)); + return at::mvlgamma_out(self, self, p); } Tensor& mvlgamma_out(const Tensor& self, int64_t p, Tensor& result) { diff --git a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp index 7587988528ebb..73f8c136794ce 100644 --- a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp +++ b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp @@ -293,7 +293,7 @@ struct ComputeLocationBase { , empty(size <= 0) {} inline Vec unnormalize(const Vec &in) const { - return (in + Vec(1)) * Vec(scaling_factor) - Vec(0.5); + return (in + Vec(static_cast(1))) * Vec(scaling_factor) - Vec(static_cast(0.5)); } inline Vec clip_coordinates(const Vec &in) const { @@ -831,7 +831,7 @@ struct ApplyGridSample(-0.75)); ApplyGridSample(const TensorAccessor& input) : inp_H(input.size(2)) diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index 186f7d8a6a78a..75a4d357a1c0b 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -147,14 +147,24 @@ static bool isGloballyDisabledAddmmCudaLt(const at::Device& device) { /* * Check whether for the given input we want to enable the Lt interface */ -static bool isInputCompliesAddmmCudaLt(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha) { +static bool isInputCompliesAddmmCudaLt( + Tensor& result, + const Tensor& self, + const Tensor& mat1, + const Tensor& 
mat2, + const Scalar& beta, + const Scalar& alpha, + Activation activation ) { + #ifdef USE_ROCM // Implies 2D bias which we currently do not send through Lt. // TODO: this check is done pre col-major input preparation, // so, this condition can be relaxed in cases when a col-major // copy of result is needed. - if (result.is_same(self)) { + if (self.is_same(result) || self.dim() == 2) { return false; } + #endif #if defined(USE_ROCM) && ROCM_VERSION == 60400 // hipblaslt TT fp32 regression on ROCm 6.4, cannot use @@ -169,13 +179,33 @@ static bool isInputCompliesAddmmCudaLt(Tensor& result, const Tensor& self, const #if defined(CUDA_VERSION) || defined(USE_ROCM) const auto scalar_type = mat1.scalar_type(); return (beta.toComplexDouble() == 1.0 + // NOTE: row-major result is important when bias is 1D. + // This is because Lt broadcasts 1D bias over the columns + // while the aten::addmm API broadcasts it over the rows, + // and this is in conjunction with the data preparation + // procedure that does not transpose arguments with + // col-major result. For col-major result we need + // to explicitly transpose the problem so that bias is + // correctly applied. + // TODO: enable col-major result if needed. + // TODO: no need to check result's layout when + // !result.is_same(self) and self.dim() == 2, because + // self needs to be copied into result and the bias ptr + // will be ignored. && result.dim() == 2 && result.is_contiguous() - // Conditions for bias to be fusable && ( - self.is_contiguous() && - // NOTE: fine to have 1-len dims to the left from the right-most one - (self.dim() == 1 || self.squeeze().dim() == 1) && - self.sizes().back() == mat2_sizes[1] + ( // Conditions for bias to be fusable -- implies direct Lt path without copies. + self.is_contiguous() && + // NOTE: fine to have 1-len dims to the left from the right-most one + (self.dim() == 1 || self.squeeze().dim() == 1) && + self.sizes().back() == mat2_sizes[1] + ) + || ( // 2D bias restrictions. self.is_contiguous() is implicit when result.is_same(self), + // and we need to copy self into result otherwise, so the self's layout becomes irrelevant. + // See also TODO from above. + activation != Activation::None && // Lt is faster when activation is fused + (self.dim() == 2 && at::is_expandable_to(self.sizes(), {mat1_sizes[0], mat2_sizes[1]})) + ) ) && ( // some dtype restrictions #ifndef USE_ROCM @@ -266,11 +296,16 @@ template bool launchGemmAndBiasCublasLt( // args contains result which is modified cublasCommonArgs& args, - const Tensor& self, + const std::optional& self, const Scalar& alpha, Activation activation = Activation::None ) { - const auto* self_ptr = self.const_data_ptr(); + // We apply bias in the epilogue only when it is 1D, + // or when it can be squeezed to 1D. + // self_ptr == nullptr implies ignore bias epilogue + // and use standard gemm-like API. + const auto* self_ptr = self.has_value() ? 
self.value().const_data_ptr() : static_cast(nullptr); + const auto tuning_ctx = at::cuda::tunable::getTuningContext(); if (tuning_ctx->IsTunableOpEnabled()) { @@ -353,34 +388,30 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma bool disable_addmm_cuda_lt = persistent_disable_addmm_cuda_lt || disable_addmm_cuda_lt_override; #ifdef USE_ROCM // Conditioned on the device index, which is not persistent - disable_addmm_cuda_lt = isGloballyDisabledAddmmCudaLt(self.device()) || disable_addmm_cuda_lt; + disable_addmm_cuda_lt = disable_addmm_cuda_lt || isGloballyDisabledAddmmCudaLt(self.device()); #endif // Condition on the input - disable_addmm_cuda_lt = !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha) || disable_addmm_cuda_lt; - // } + disable_addmm_cuda_lt = disable_addmm_cuda_lt || !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha, activation); at::ScalarType scalar_type = mat1.scalar_type(); bool is_float_output_with_half_input = (scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::BFloat16) && result.scalar_type() == at::ScalarType::Float; + #ifdef USE_ROCM + disable_addmm_cuda_lt = disable_addmm_cuda_lt || is_float_output_with_half_input; + #endif + + bool use_bias_ptr_lt = (self.dim() == 1) && !disable_addmm_cuda_lt; + // for float output with half input cublasLT with bias produces wrong results + use_bias_ptr_lt &= !is_float_output_with_half_input; + // Handle result/self shapes if (!result.is_same(self)) { at::native::resize_output(result, {mat1.sizes()[0], mat2.sizes()[1]}); - const auto self_maybe_expanded = [&]() -> c10::MaybeOwned { - if (disable_addmm_cuda_lt) { - // When in non-Lt path we do expand self even before - // check for beta != 0.0 to make sure that - // test_sparse_csr.py::TestSparseCSRCUDA::test_addmm_errors_* - // runs green. - return expand_size(self, result.sizes(), "addmm"); - } - // copy next, should broadcast - return c10::MaybeOwned::borrowed(self); - }(); - // We copy bias when in the non-Lt path - if (beta.toComplexDouble() != 0.0 && disable_addmm_cuda_lt) { + // We do not copy bias only when we need the bias ptr + if (beta.toComplexDouble() != 0.0 && !use_bias_ptr_lt) { // NOTE: self should broadcast over result - at::native::copy_(result, *self_maybe_expanded); + at::native::copy_(result, *expand_size(self, result.sizes(), "addmm")); } } @@ -428,7 +459,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma scalar_type, "addmm_cuda_lt", [&] { - lt_success = launchGemmAndBiasCublasLt(args, self, alpha, activation); + lt_success = launchGemmAndBiasCublasLt(args, use_bias_ptr_lt ? std::make_optional(self) : std::nullopt, alpha, activation); } ); #endif @@ -440,7 +471,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma scalar_type, "addmm_cuda_lt", [&] { - lt_success = launchGemmAndBiasCublasLt(args, self, alpha, activation); + lt_success = launchGemmAndBiasCublasLt(args, use_bias_ptr_lt ? 
std::make_optional(self) : std::nullopt, alpha, activation); } ); } // end is_float_output_with_half_input @@ -896,7 +927,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) { return _int_mm_out_cuda(self, mat2, result); } -static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, bool is_bmm, const std::optional& self_baddbmm = std::nullopt) { +static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, const std::optional& self_baddbmm = std::nullopt) { // ref ATen/native/LinearAlgebra.cpp common_checks_baddbmm_bmm TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor"); TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor"); @@ -920,7 +951,7 @@ static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& bat (out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)), "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs"); - if (!is_bmm && self_baddbmm.has_value()) { + if (self_baddbmm.has_value()) { const auto& self = self_baddbmm.value(); TORCH_CHECK(self.dim() == 3, "self must be a 3D tensor"); TORCH_CHECK(self.sizes() == output_size, "self must have the same shape as the output"); @@ -928,15 +959,12 @@ static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& bat } Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) { - IntArrayRef batch1_sizes = batch1.sizes(); - IntArrayRef batch2_sizes = batch2.sizes(); - - Tensor out = at::empty({batch1_sizes[0], batch1_sizes[1], batch2_sizes[2]}, batch1.options().dtype(out_dtype)); + Tensor out = at::empty({batch1.size(0), batch1.size(1), batch2.size(2)}, batch1.options().dtype(out_dtype)); return _bmm_out_dtype_cuda(batch1, batch2, out_dtype, out); } Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, Tensor &out) { - baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype, true); + baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype); Scalar beta(0.0); Scalar alpha(1.0); { @@ -948,14 +976,16 @@ Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at } Tensor _baddbmm_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) { - // We need to copy the tensor - Tensor out = self.clone().to(self.options().dtype(out_dtype)); - - return _baddbmm_out_dtype_cuda(out, batch1, batch2, out_dtype, beta, alpha, out); + TORCH_CHECK(self.scalar_type() == out_dtype || self.scalar_type() == batch1.dtype(), + "self dtype must match either out_dtype or batch1 dtype"); + Tensor out = at::empty({batch1.size(0), batch1.size(1), batch2.size(2)}, batch1.options().dtype(out_dtype)); + return _baddbmm_out_dtype_cuda(self, batch1, batch2, out_dtype, beta, alpha, out); } Tensor& _baddbmm_out_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) { - baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, false, self); + baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, out); + // We need to copy the tensor + out.copy_(self); { NoNamesGuard guard; 
baddbmm_out_cuda_impl(out, out, batch1, batch2, beta, alpha); @@ -990,24 +1020,27 @@ Tensor& _mm_dtype_out_cuda(const Tensor& self, const Tensor& mat2, const at::Sca } Tensor _addmm_dtype_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) { - Tensor result = at::empty(self.sizes(), self.options().dtype(out_dtype)); + TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor"); + TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor"); + Tensor result = at::empty({mat1.size(0), mat2.size(1)}, self.options().dtype(out_dtype)); return _addmm_dtype_out_cuda(self, mat1, mat2, out_dtype, beta, alpha, result); } Tensor& _addmm_dtype_out_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) { - TORCH_CHECK(self.scalar_type() == mat2.scalar_type(), "self and mat2 must have the same dtype, but got ", self.scalar_type(), " and ", mat2.scalar_type()); - TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type()); +// repeat dimensionality checks for direct calls to `out` overload TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor"); TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor"); TORCH_CHECK( mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (", mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); + TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type()); + TORCH_CHECK(out_dtype == mat1.scalar_type() || + (out_dtype == at::ScalarType::Float && (mat1.scalar_type() == at::ScalarType::Half || mat1.scalar_type() == at::ScalarType::BFloat16)), + "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs"); TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor"); - TORCH_CHECK(out_dtype == self.scalar_type() || - (out_dtype == at::ScalarType::Float && (self.scalar_type() == at::ScalarType::Half || self.scalar_type() == at::ScalarType::BFloat16)), - "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs"); - TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor"); + TORCH_CHECK(out_dtype == self.scalar_type() || self.scalar_type() == mat1.scalar_type(), + "self dtype must match either out_dtype or mat1 dtype"); addmm_out_cuda_impl(out, self, mat1, mat2, beta, alpha); diff --git a/aten/src/ATen/native/cuda/CUDALoops.cuh b/aten/src/ATen/native/cuda/CUDALoops.cuh index c42d03b9cbf7f..b83ec3c761e9b 100644 --- a/aten/src/ATen/native/cuda/CUDALoops.cuh +++ b/aten/src/ATen/native/cuda/CUDALoops.cuh @@ -884,6 +884,69 @@ struct type_specialized_kernel_launcher { } }; +template +struct type_specialized_broadcast_kernel_launcher { + template < + typename func_t, + typename array_t, + typename dtypes_t, + typename calc_t> + static void apply( + int64_t numel, + func_t f, + array_t data, + dtypes_t dtypes, + calc_t offset_calc) { + using traits = function_traits; + using ret_t = typename traits::result_type; + using arg0_t = typename traits::template arg<0>::type; + using arg1_t = typename traits::template 
arg<1>::type; + if (dtypes[0] == rt_binary_specializations[arg_index][0] && + dtypes[1] == rt_binary_specializations[arg_index][1] && + dtypes[2] == rt_binary_specializations[arg_index][2]) { + using ret_cpp_t = c10::impl::ScalarTypeToCPPTypeT; + using arg0_cpp_t = c10::impl::ScalarTypeToCPPTypeT; + using arg1_cpp_t = c10::impl::ScalarTypeToCPPTypeT; + constexpr int grp_sz = 128; + launch_legacy_kernel_manual_unroll(numel, [=] GPU_LAMBDA(int idx, bool unrl) { + if (unrl) { + auto offsets0 = offset_calc.get(idx); + auto offsets1 = offset_calc.get(idx + grp_sz); + auto offsets2 = offset_calc.get(idx + grp_sz * 2); + auto offsets3 = offset_calc.get(idx + grp_sz * 3); + void* out0 = data[0] + offsets0[0]; + void* out1 = data[0] + offsets1[0]; + void* out2 = data[0] + offsets2[0]; + void* out3 = data[0] + offsets3[0]; + auto u = c10::load(data[1] + offsets0[1]); + auto v = c10::load(data[2] + offsets0[2]); + ret_t result0 = f(c10::convert(u), c10::convert(v)); + auto u1 = c10::load(data[1] + offsets1[1]); + auto v1 = c10::load(data[2]+ offsets1[2]); + ret_t result1 = f(c10::convert(u1), c10::convert(v1)); + auto u2 = c10::load(data[1] + offsets2[1]); + auto v2 = c10::load(data[2] + offsets2[2]); + ret_t result2 = f(c10::convert(u2), c10::convert(v2)); + auto u3 = c10::load(data[1] + offsets3[1]); + auto v3 = c10::load(data[2] + offsets3[2]); + ret_t result3 = f(c10::convert(u3), c10::convert(v3)); + *(ret_cpp_t*)out0 = c10::convert(result0); + *(ret_cpp_t*)out1 = c10::convert(result1); + *(ret_cpp_t*)out2 = c10::convert(result2); + *(ret_cpp_t*)out3 = c10::convert(result3); + } else { + auto offsets = offset_calc.get(idx); + void* out = data[0] + offsets[0]; + auto u = c10::load(data[1] + offsets[1]); + auto v = c10::load(data[2] + offsets[2]); + ret_t result = f(c10::convert(u), c10::convert(v)); + *(ret_cpp_t*)out = c10::convert(result); + } + }); + } + } +}; + } // namespace #endif @@ -1002,6 +1065,32 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) { } auto offset_calc = ::make_offset_calculator(iter); #ifdef USE_ROCM + if (check_binary_rt_types_for_specialization(iter)) { + // constexpr to reduce the amount of kernels generated for + // broadcast elementwise with mexed dtypes and limit which functors are actually + // applied to the load and store at compile time. 
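For context, a minimal sketch of the dispatch idea used here, with illustrative names only (not the actual rt_binary_specializations machinery): a compile-time table of dtype triples is unrolled, and the entry that matches the runtime dtypes instantiates a launcher specialized for exactly those types.

```cpp
#include <array>
#include <cstddef>
#include <cstdio>

enum class DType { Float, Half, BFloat16 };

// Hypothetical specialization table: (result, lhs, rhs) dtype triples.
constexpr std::array<std::array<DType, 3>, 2> kSpecializations{{
    {DType::Float, DType::Half, DType::Half},
    {DType::Float, DType::BFloat16, DType::BFloat16},
}};

template <std::size_t I>
void try_launch(const std::array<DType, 3>& runtime_dtypes) {
  if constexpr (I < kSpecializations.size()) {
    if (runtime_dtypes == kSpecializations[I]) {
      // A real implementation would launch a kernel typed on kSpecializations[I] here.
      std::printf("launching specialization %zu\n", I);
      return;
    }
    try_launch<I + 1>(runtime_dtypes);
  }
}

int main() {
  try_launch<0>({DType::Float, DType::Half, DType::Half});
}
```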
+ using func_tuple = typename traits::ArgsTuple; + if constexpr ( + std::is_same_v && traits::arity == 2 && + check_binary_functor_types_for_specialization< + func_tuple, + float, + float, + traits::arity, + /*arg_num=*/0>::check()) { + memory::detail::static_unroll< + type_specialized_broadcast_kernel_launcher, + rt_binary_specializations.size()>::with_args( + numel, + f, + data, + dtypes, + offset_calc + ); + return; + } + } + constexpr int grp_sz = 128; launch_legacy_kernel_manual_unroll(numel, [=] GPU_LAMBDA(int idx, bool unrl) { if (unrl) { diff --git a/aten/src/ATen/native/cuda/CompositeRandomAccessor.h b/aten/src/ATen/native/cuda/CompositeRandomAccessor.h index d47a7fa776f1b..eb8587d1f9337 100644 --- a/aten/src/ATen/native/cuda/CompositeRandomAccessor.h +++ b/aten/src/ATen/native/cuda/CompositeRandomAccessor.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu index 344906a2a4df2..88c552e9bf120 100644 --- a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu @@ -75,30 +75,52 @@ static inline bool can_use_int32_nhwc( return true; } +static inline bool can_use_int32_nchw( + int64_t nbatch, int64_t channels, + int64_t height, int64_t width, + int64_t pooled_height, int64_t pooled_width) { + int64_t hw = height * width; + return can_use_int32_nhwc( + nbatch, channels, height, width, + pooled_height, pooled_width, + channels * hw, // in_stride_n + hw, // in_stride_c + width, // in_stride_h + 1 // in_stride_w + ); +} + // kernels borrowed from Caffe -template -__global__ void max_pool_forward_nchw(const int nthreads, const scalar_t* bottom_data, - const int64_t channels, const int64_t height, - const int64_t width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, const int pad_h, const int pad_w, - const int dilation_h, const int dilation_w, scalar_t* top_data, +template +__global__ void max_pool_forward_nchw( + const index_t nthreads, + const scalar_t* bottom_data, + const int64_t channels, + const int64_t height, + const int64_t width, + const int pooled_height, + const int pooled_width, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, + const int dilation_h, const int dilation_w, + scalar_t* top_data, int64_t* top_mask) { - CUDA_KERNEL_LOOP(index, nthreads) { - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - int hend = min(hstart + (kernel_h - 1) * dilation_h + 1, height); - int wend = min(wstart + (kernel_w - 1) * dilation_w + 1, width); + CUDA_KERNEL_LOOP_TYPE(index, nthreads, index_t) { + index_t pw = index % pooled_width; + index_t ph = (index / pooled_width) % pooled_height; + index_t c = (index / pooled_width / pooled_height) % channels; + index_t n = index / pooled_width / pooled_height / channels; + index_t hstart = ph * stride_h - pad_h; + index_t wstart = pw * stride_w - pad_w; + index_t hend = min(hstart + (kernel_h - 1) * dilation_h + 1, height); + index_t wend = min(wstart + (kernel_w - 1) * dilation_w + 1, width); while(hstart < 0) hstart += dilation_h; while(wstart < 0) wstart += dilation_w; scalar_t 
maxval = at::numeric_limits::lower_bound(); // -Infinity - int maxidx = hstart * width + wstart; + index_t maxidx = hstart * width + wstart; const scalar_t* btm_data = bottom_data + (n * channels + c) * height * width; for (int h = hstart; h < hend; h += dilation_h) { for (int w = wstart; w < wend; w += dilation_w) { @@ -251,32 +273,39 @@ __global__ void max_pool_forward_nhwc( static constexpr int BLOCK_THREADS = 256; -template +template #if defined (USE_ROCM) C10_LAUNCH_BOUNDS_2(BLOCK_THREADS, 4) #else C10_LAUNCH_BOUNDS_2(BLOCK_THREADS, 8) #endif -__global__ void max_pool_backward_nchw(const scalar_t* top_diff, - const int64_t* top_mask, const int num, const int64_t channels, - const int64_t height, const int64_t width, const int pooled_height, - const int pooled_width, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, const int pad_h, const int pad_w, +__global__ void max_pool_backward_nchw( + const scalar_t* top_diff, + const int64_t* top_mask, + const index_t num, + const index_t channels, + const index_t height, + const index_t width, + const index_t pooled_height, + const index_t pooled_width, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, scalar_t* bottom_diff) { - CUDA_KERNEL_LOOP(index, height*width) { - int h = index / width; - int w = index - h * width; - int phstart = p_start(h, pad_h, kernel_h, dilation_h, stride_h); - int phend = p_end(h, pad_h, pooled_height, stride_h); - int pwstart = p_start(w, pad_w, kernel_w, dilation_w, stride_w); - int pwend = p_end(w, pad_w, pooled_width, stride_w); - for (int n = blockIdx.y; n < num; n += gridDim.y) { - for (int c = blockIdx.z; c < channels; c+= gridDim.z) { + CUDA_KERNEL_LOOP_TYPE(index, height*width, index_t) { + index_t h = index / width; + index_t w = index - h * width; + index_t phstart = p_start(h, pad_h, kernel_h, dilation_h, stride_h); + index_t phend = p_end(h, pad_h, pooled_height, stride_h); + index_t pwstart = p_start(w, pad_w, kernel_w, dilation_w, stride_w); + index_t pwend = p_end(w, pad_w, pooled_width, stride_w); + for (index_t n = blockIdx.y; n < num; n += gridDim.y) { + for (index_t c = blockIdx.z; c < channels; c += gridDim.z) { accscalar_t gradient = accscalar_t(0); - int offset = (n * channels + c) * pooled_height * pooled_width; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { + index_t offset = (n * channels + c) * pooled_height * pooled_width; + for (index_t ph = phstart; ph < phend; ++ph) { + for (index_t pw = pwstart; pw < pwend; ++pw) { if (top_mask[ph * pooled_width + pw + offset] == h * width + w) { gradient += static_cast(top_diff[ph * pooled_width + pw + offset]); } @@ -469,8 +498,6 @@ const Tensor& indices) { const int64_t in_stride_h = input.stride(-2); const int64_t in_stride_w = input.stride(-1); - const int count = safe_downcast(output.numel()); - AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "max_pool2d_with_indices_out_cuda_frame", [&] { @@ -553,14 +580,42 @@ const Tensor& indices) { break; } case MemoryFormat::Contiguous: { - const int num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, - BLOCK_THREADS); - max_pool_forward_nchw - <<>>( - count, input_data, - nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, - kH, kW, dH, dW, padH, padW, dilationH, dilationW, - output_data, indices_data); + const int threads = std::min( + 
at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, + BLOCK_THREADS); + const int64_t nthreads = output.numel(); + bool use_int32 = can_use_int32_nchw( + nbatch, nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth); + const int maxGridX = at::cuda::getCurrentDeviceProperties()->maxGridSize[0]; + const int blocks = static_cast(std::min( + ceil_div(nthreads, static_cast(threads)), + static_cast(maxGridX))); + auto stream = at::cuda::getCurrentCUDAStream(); + if (use_int32) { + max_pool_forward_nchw + <<>>( + static_cast(nthreads), + input_data, + static_cast(nInputPlane), + static_cast(inputHeight), + static_cast(inputWidth), + static_cast(outputHeight), + static_cast(outputWidth), + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + output_data, indices_data); + } else { + max_pool_forward_nchw + <<>>( + nthreads, + input_data, + nInputPlane, + inputHeight, + inputWidth, + outputHeight, + outputWidth, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + output_data, indices_data); + } C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } @@ -633,8 +688,6 @@ const Tensor& gradInput) { gradInput.zero_(); - int64_t count = input.numel(); - AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "max_pool2d_with_indices_out_cuda_frame", [&] { @@ -692,25 +745,45 @@ const Tensor& gradInput) { break; } case MemoryFormat::Contiguous: { - int imgcount = inputWidth * inputHeight; - dim3 grid; - const int blocks = (imgcount + BLOCK_THREADS - 1) / BLOCK_THREADS; - grid.x = blocks; - grid.y = nbatch; - uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; - if (maxGridY < grid.y) grid.y = maxGridY; - grid.z = nInputPlane; - uint64_t maxGridZ = at::cuda::getCurrentDeviceProperties()->maxGridSize[2]; - if (maxGridZ < grid.z) grid.z = maxGridZ; - - max_pool_backward_nchw - <<>>( - gradOutput_data, - indices_data, - nbatch, - nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, - kH, kW, dH, dW, padH, padW, dilationH, dilationW, - gradInput_data); + const int threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, + BLOCK_THREADS); + const int imgcount = inputWidth * inputHeight; + const int maxGridX = at::cuda::getCurrentDeviceProperties()->maxGridSize[0]; + const int maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; + const int maxGridZ = at::cuda::getCurrentDeviceProperties()->maxGridSize[2]; + const int blocks_x = std::min(ceil_div(imgcount, threads), maxGridX); + dim3 grid(blocks_x, static_cast(std::min(nbatch, maxGridY)), static_cast(std::min(nInputPlane, maxGridZ))); + bool use_int32 = can_use_int32_nchw( + nbatch, nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth); + auto stream = at::cuda::getCurrentCUDAStream(); + if (use_int32) { + max_pool_backward_nchw + <<>>( + gradOutput_data, + indices_data, + static_cast(nbatch), + static_cast(nInputPlane), + static_cast(inputHeight), + static_cast(inputWidth), + static_cast(outputHeight), + static_cast(outputWidth), + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + gradInput_data); + } else { + max_pool_backward_nchw + <<>>( + gradOutput_data, + indices_data, + nbatch, + nInputPlane, + inputHeight, + inputWidth, + outputHeight, + outputWidth, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + gradInput_data); + } C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu index ab3747df031eb..9af8abcf3cf82 100644 --- 
a/aten/src/ATen/native/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu @@ -78,9 +78,18 @@ __global__ void EmbeddingBag_updateOutputKernel_max( scalar_t weightFeatMax = 0; int64_t bag_size_ = 0; int64_t maxWord = -1; + + // Separate validation loop reduces register pressure in the main loop below. + // No early exit (break) on invalid input as benchmarking shows it degrades performance. + bool has_invalid_index = false; + for (int64_t emb = begin; emb < end; emb++) { + index_t input_idx = input[emb]; + has_invalid_index = has_invalid_index || (input_idx < 0 || input_idx >= numRows); + } + CUDA_KERNEL_ASSERT(!has_invalid_index && "Invalid input index in EmbeddingBag: index out of range [0, numRows)"); + for (int64_t emb = begin; emb < end; emb++) { bool pad = (input[emb] == padding_idx); - CUDA_KERNEL_ASSERT(input[emb] < numRows); const int64_t weightRow = input[emb] * weight_stride0; scalar_t weightValue = weightFeat[weightRow]; if (bag_size_ == 0 || weightValue > weightFeatMax) { @@ -129,10 +138,19 @@ __global__ void EmbeddingBag_updateOutputKernel_sum_mean( CUDA_KERNEL_ASSERT(end >= begin); accscalar_t weightFeatSum = 0; int64_t bag_size_ = 0; + + // Separate validation loop reduces register pressure in the main loop below. + // No early exit (break) on invalid input as benchmarking shows it degrades performance. + bool has_invalid_index = false; + for (int64_t emb = begin; emb < end; emb++) { + index_t input_idx = input[emb]; + has_invalid_index = has_invalid_index || (input_idx < 0 || input_idx >= numRows); + } + CUDA_KERNEL_ASSERT(!has_invalid_index && "Invalid input index in EmbeddingBag: index out of range [0, numRows)"); + for (int64_t emb = begin; emb < end; emb++) { index_t input_idx = input[emb]; bool pad = (input_idx == padding_idx); - CUDA_KERNEL_ASSERT(0 <= input_idx && input_idx < numRows); const int64_t weightRow = input_idx * weight_stride0; scalar_t weightValue = weightFeat[weightRow]; weightValue = pad ? static_cast(0) : weightValue; diff --git a/aten/src/ATen/native/cuda/GroupedBlas.cpp b/aten/src/ATen/native/cuda/GroupedBlas.cpp index f64eb317d0cca..f4b229156d79f 100644 --- a/aten/src/ATen/native/cuda/GroupedBlas.cpp +++ b/aten/src/ATen/native/cuda/GroupedBlas.cpp @@ -22,6 +22,9 @@ #include #include #include +#ifdef USE_ROCM +#include +#endif #include #ifdef USE_FBGEMM_GENAI @@ -75,9 +78,9 @@ _mx8_mx8_bf16_grouped_mm_fbgemm( const Tensor& mat_a, const Tensor& mat_b, const Tensor& scale_a, - const SwizzleType& swizzle_a, + const SwizzleType swizzle_a, const Tensor& scale_b, - const SwizzleType& swizzle_b, + const SwizzleType swizzle_b, const std::optional& offs, Tensor& out) { const bool a_is_2d = mat_a.dim() == 2; @@ -604,6 +607,8 @@ _scaled_grouped_mm_cuda_v2( // scale shape checks _check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */); _check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */); + // swizzle checks + TORCH_CHECK_VALUE(swizzle_a_enum.size() == 1 && swizzle_b_enum.size() == 1, "Expected single swizzle argument"); return _mx8_mx8_bf16_grouped_mm_fbgemm( mat_a, mat_b, @@ -666,12 +671,26 @@ std::optional out_dtype) { // _scaled_mm_allowed_device is used here within _grouped_mm_cuda which seems incorrect since scale is not used. 
// the _grouped_mm_fallback should be safe for any ROCm GPU since it's just calling typical mm/bmm bool use_fast_path = false; + // On non CK system(w/ ROCm), make sure use_fast_path is false +#if defined(USE_ROCM_CK_GEMM) + if (at::detail::getCUDAHooks().isGPUArch({"gfx942", "gfx950"})) { + use_fast_path = true; + } +#endif //USE_ROCM_CK_GEMM #endif const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype); Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_); if (use_fast_path) { // fast path, no d2h sync needed +#ifndef USE_ROCM at::cuda::detail::bf16bf16_grouped_mm(mat_a, mat_b, offs, bias, out); +#else +#if defined(USE_ROCM_CK_GEMM) + at::hip::detail::group_gemm_ck(mat_a, mat_b, offs, bias, out); +#else + TORCH_WARN("ROCm: Group Gemm through CK not selected."); +#endif //USE_ROCM_CK_GEMM +#endif } else { _grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out); } diff --git a/aten/src/ATen/native/cuda/IndexKernel.cu b/aten/src/ATen/native/cuda/IndexKernel.cu index 927af661396cd..db85f62c8d124 100644 --- a/aten/src/ATen/native/cuda/IndexKernel.cu +++ b/aten/src/ATen/native/cuda/IndexKernel.cu @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include @@ -74,7 +73,6 @@ void gpu_index_kernel(TensorIteratorBase& iter, const IntArrayRef index_size, co char* const out_ptr = static_cast(iter.data_ptr(0)); char* const in_ptr = static_cast(iter.data_ptr(1)); - if (is_gather_like && num_indices==1) { const size_t element_size = iter.element_size(0); constexpr size_t alignment = 16; @@ -84,16 +82,9 @@ void gpu_index_kernel(TensorIteratorBase& iter, const IntArrayRef index_size, co auto ind_dim_size = index_size[0]; auto inp_stride_bytes = index_stride[0]; auto out_stride_bytes = iter.strides(0)[1]; - // avoid grid overflow in the fast kernel - const int64_t vec_chunks = ceil_div(slice_size, alignment); - const int64_t blocks_per_slice_upper = ceil_div(vec_chunks, (int64_t)launch_size_nd); - const int max_grid_y = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; - // if it's an eligible grid we use the fast path, otherwise default to slower path - if (blocks_per_slice_upper <= max_grid_y) { - at::native::vectorized_gather_kernel_launch(out_ptr, in_ptr, (int64_t*)iter.data_ptr(2), num_ind, - slice_size, ind_dim_size, inp_stride_bytes, out_stride_bytes, /*allow_neg_indices*/true); - return; - } + at::native::vectorized_gather_kernel_launch(out_ptr, in_ptr, (int64_t*)iter.data_ptr(2), num_ind, + slice_size, ind_dim_size, inp_stride_bytes, out_stride_bytes, /*allow_neg_indices*/true); + return; } } diff --git a/aten/src/ATen/native/cuda/IndexKernelUtils.cu b/aten/src/ATen/native/cuda/IndexKernelUtils.cu index 8343c60418952..1e998251dd7be 100644 --- a/aten/src/ATen/native/cuda/IndexKernelUtils.cu +++ b/aten/src/ATen/native/cuda/IndexKernelUtils.cu @@ -13,11 +13,12 @@ __global__ void vectorized_gather_kernel(char * out, char * inp, index_t * idx, if (allow_neg_indices) { ind = (ind < 0) ? 
ind + ind_dim_size : ind; } - CUDA_KERNEL_ASSERT_VERBOSE(ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds", "Expected 0 <= index < ind_dim_size(%ld), but got index = %ld", ind_dim_size, ind); - int32_t off = (blockDim.x * blockIdx.y + threadIdx.x) * Alignment; // off is guaranteed to be within int32 limits - if (off >= slice_size) return; - auto vec = at::native::memory::ld_vec(inp + ind * inp_stride + off); - at::native::memory::st_vec(out + blockIdx.x * (int32_t)out_stride + off, vec); // out offset is guaranteed to be within int32 limits + CUDA_KERNEL_ASSERT_VERBOSE(ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"); + // off is guaranteed to be within int32 limits + for (int32_t off = (blockDim.x * blockIdx.y + threadIdx.x) * Alignment; off < slice_size; off += blockDim.x * gridDim.y * Alignment) { + auto vec = at::native::memory::ld_vec(inp + ind * inp_stride + off); + at::native::memory::st_vec(out + blockIdx.x * (int32_t)out_stride + off, vec); // out offset is guaranteed to be within int32 limits + } } @@ -30,7 +31,9 @@ void vectorized_gather_kernel_launch(char * out, char * inp, index_t * idx, int auto num_threads = at::round_up( at::ceil_div(slice_size_in_bytes, Alignment), static_cast(C10_WARP_SIZE)); - dim3 grid = {static_cast(num_ind), static_cast(at::ceil_div(slice_size_in_bytes, max_num_threads * Alignment)), 1}; + uint32_t grid_y = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; + grid_y = std::min(static_cast(at::ceil_div(slice_size_in_bytes, max_num_threads * Alignment)), grid_y); + dim3 grid = {static_cast(num_ind), grid_y, 1}; auto block = std::min(max_num_threads, num_threads); vectorized_gather_kernel<<>>(out, inp, idx, num_ind, slice_size_in_bytes, ind_dim_size, inp_stride_bytes, out_stride_bytes, allow_neg_indices); diff --git a/aten/src/ATen/native/cuda/KernelUtils.cuh b/aten/src/ATen/native/cuda/KernelUtils.cuh index 5c8b98105bb26..fd406829707a1 100644 --- a/aten/src/ATen/native/cuda/KernelUtils.cuh +++ b/aten/src/ATen/native/cuda/KernelUtils.cuh @@ -5,69 +5,11 @@ #include #endif -// ROCm 6.3 is planned to have these functions, but until then here they are. 
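As an aside, a minimal CUDA sketch (not part of the patch; plain byte copies stand in for the 16-byte vector loads) of the grid-stride pattern the vectorized gather kernel above now uses, so that a grid.y capped at maxGridSize[1] still covers arbitrarily large slices:

```cpp
#include <cstdint>

// Each (blockIdx.y, threadIdx.x) pair walks the slice in strides of the whole
// y-grid, so correctness no longer depends on gridDim.y covering the slice.
__global__ void strided_slice_copy(char* out, const char* in, int64_t slice_bytes) {
  constexpr int kAlign = 16;  // stands in for the Alignment template parameter
  for (int64_t off = static_cast<int64_t>(blockDim.x * blockIdx.y + threadIdx.x) * kAlign;
       off < slice_bytes;
       off += static_cast<int64_t>(blockDim.x) * gridDim.y * kAlign) {
    for (int i = 0; i < kAlign && off + i < slice_bytes; ++i) {
      out[off + i] = in[off + i];  // real kernel does one vectorized ld/st here
    }
  }
}
```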
#if defined(USE_ROCM) #include #include #include - -__device__ inline __hip_bfloat162 preview_unsafeAtomicAdd(__hip_bfloat162* address, __hip_bfloat162 value) { -#if (defined(__gfx942__)) && \ - __has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2bf16) - typedef unsigned short __attribute__((ext_vector_type(2))) vec_short2; - static_assert(sizeof(vec_short2) == sizeof(__hip_bfloat162_raw)); - union { - __hip_bfloat162_raw bf162_raw; - vec_short2 vs2; - } u{static_cast<__hip_bfloat162_raw>(value)}; - u.vs2 = __builtin_amdgcn_flat_atomic_fadd_v2bf16((vec_short2*)address, u.vs2); - return static_cast<__hip_bfloat162>(u.bf162_raw); -#else - static_assert(sizeof(unsigned int) == sizeof(__hip_bfloat162_raw)); - union u_hold { - __hip_bfloat162_raw h2r; - unsigned int u32; - }; - u_hold old_val, new_val; - old_val.u32 = __hip_atomic_load((unsigned int*)address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - do { - new_val.h2r = __hadd2(old_val.h2r, value); - } while (!__hip_atomic_compare_exchange_strong( - (unsigned int*)address, &old_val.u32, new_val.u32, - __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)); - return old_val.h2r; -#endif -} - -__device__ inline __half2 preview_unsafeAtomicAdd(__half2* address, __half2 value) { -#if (defined(__gfx942__)) && \ - __has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2f16) - // The api expects an ext_vector_type of half - typedef _Float16 __attribute__((ext_vector_type(2))) vec_fp162; - static_assert(sizeof(vec_fp162) == sizeof(__half2_raw)); - union { - __half2_raw h2r; - vec_fp162 fp16; - } u {static_cast<__half2_raw>(value)}; - u.fp16 = __builtin_amdgcn_flat_atomic_fadd_v2f16((vec_fp162*)address, u.fp16); - return static_cast<__half2>(u.h2r); -#else - static_assert(sizeof(__half2_raw) == sizeof(unsigned int)); - union u_hold { - __half2_raw h2r; - unsigned int u32; - }; - u_hold old_val, new_val; - old_val.u32 = __hip_atomic_load((unsigned int*)address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - do { - new_val.h2r = __hadd2(old_val.h2r, value); - } while (!__hip_atomic_compare_exchange_strong( - (unsigned int*)address, &old_val.u32, new_val.u32, - __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)); - return old_val.h2r; -#endif -} -#define ATOMICADD preview_unsafeAtomicAdd +#define ATOMICADD unsafeAtomicAdd #define NATIVE_ZERO_BF16 __float2bfloat16(0.0f) #else #define ATOMICADD atomicAdd diff --git a/aten/src/ATen/native/cuda/LogAddExpKernel.cu b/aten/src/ATen/native/cuda/LogAddExpKernel.cu index 7b8b5b5bb2032..910d3c1cddc93 100644 --- a/aten/src/ATen/native/cuda/LogAddExpKernel.cu +++ b/aten/src/ATen/native/cuda/LogAddExpKernel.cu @@ -2,18 +2,250 @@ #include #include #include +#include +#include +#include #include #include #include #include +#include + +#include +#include // NOTE: CUDA on Windows requires that the enclosing function // of a __device__ lambda not have internal linkage. namespace at::native { +// custom min and max to be used in logaddexp for complex arguments +template +__host__ __device__ c10::complex _logaddexp_minmax(const c10::complex& x, const c10::complex& y) { + scalar_t xr = std::real(x); + scalar_t yr = std::real(y); + if (::isnan(yr) || (::isnan(std::imag(y)))) { + return y; + } else if (::isnan(xr) || (::isnan(std::imag(x)))) { + return x; + } else if (min) { // min + return (xr < yr) ? x : y; + } else { // max + return (xr >= yr) ? 
x : y; + } +} + +template +__host__ __device__ scalar_t _log_add_exp_helper(const scalar_t& x, const scalar_t& y) { + // Reference : https://www.tensorflow.org/api_docs/python/tf/math/cumulative_logsumexp + // Using the original expression: `at::_isnan(y) ? y : std::min(x, y)` causes an error in ROCM + const auto isnan_x = at::_isnan(x); + const auto isnan_y = at::_isnan(y); + scalar_t min = isnan_y ? y : (isnan_x ? x : std::min(x, y)); + scalar_t max = isnan_y ? y : (isnan_x ? x : std::max(x, y)); + if (min != max || ::isfinite(min)) { + // nan will be propagated here + return ::log1p(std::exp(min - max)) + max; + } else { + // special case to correctly handle infinite cases + return x; + } +} + +template +__host__ __device__ c10::complex _fast_build_exp(const c10::complex& x) { + // complex exponential function, but implemented manually to get fast compilation time + // this function only handles the case where x is finite (not inf nor nan) + const auto xreal = std::real(x); + const auto ximag = std::imag(x); + const auto exp_x_abs = std::exp(xreal); + auto exp_x_real = exp_x_abs * std::cos(ximag); + auto exp_x_imag = exp_x_abs * std::sin(ximag); + return {exp_x_real, exp_x_imag}; +} + +template +__host__ __device__ c10::complex _fast_build_exp_inf(const c10::complex& x) { + // complex exponential function, but implemented manually to get fast compilation time + // this function only handles the case where the real part of x is infinite + const auto ximag = std::imag(x); + constexpr auto exp_x_abs = std::numeric_limits::infinity(); + if (!::isfinite(ximag)) { // add this to make consistent with std::exp(x+yi) + return {exp_x_abs, std::numeric_limits::quiet_NaN()}; + } + const auto sin = std::sin(ximag); + const auto cos = std::cos(ximag); + // special case if the angle is exactly a multiple of pi/2 + auto exp_x_real = (cos == 0) ? (scalar_t)0.0 : exp_x_abs * cos; + auto exp_x_imag = (sin == 0) ? (scalar_t)0.0 : exp_x_abs * sin; + return {exp_x_real, exp_x_imag}; +} + +template +__host__ __device__ c10::complex _log_add_exp_helper(const c10::complex& x, const c10::complex& y) { + c10::complex min = _logaddexp_minmax(x, y); + c10::complex max = _logaddexp_minmax(x, y); + scalar_t min_real = std::real(min); + scalar_t max_real = std::real(max); + + if (::isnan(min_real) || ::isnan(std::imag(min))) { + // handling the "infectious" NaNs + return {std::numeric_limits::quiet_NaN(), std::numeric_limits::quiet_NaN()}; + } + else if ((!::isfinite(min_real)) && (min_real == max_real)) { + if (min_real < 0) { + // handle the -inf case, the imaginary part here does not really matter as the exp(value) + // will be around 0.0 and the angle (i.e. the imaginary part) cannot be determined. + // It does not matter if we're taking the exp of this value + return min; + } else { + // handle the +inf case, we don't need the special precision for log1p for small values + // and to avoid producing nan in case of real(max) == real(min) == +inf + const auto exp_min = _fast_build_exp_inf(min); + const auto exp_max = _fast_build_exp_inf(max); + return ::log1p(exp_min + exp_max - 1); // log1p(x - 1) builds faster than log + } + } else { + const auto minmax = min - max; + c10::complex exp_minmax; + if (!::isfinite(minmax.real())) { + exp_minmax = minmax.real() < 0 ? 
c10::complex{0.0, 0.0} : _fast_build_exp_inf(minmax); + } else { + exp_minmax = _fast_build_exp(minmax); + } + return ::log1p(exp_minmax) + max; + } +} + +// Complex logaddexp jiterator string +const auto logaddexp_complex_string = jiterator_stringify( + template + std::complex log1p(const std::complex& z) + { + using complex_t = std::complex; + T x = z.real(); + T y = z.imag(); + T zabs = abs(z); + T theta = atan2(y, x + T(1)); + if (zabs < 0.5) { + T r = x * (T(2) + x) + y * y; + if (r == 0) { // handle underflow + return complex_t(x, theta); + } + return complex_t(T(0.5) * std::log1p(r), theta); + } else { + T z0 = std::hypot(x + 1, y); + return complex_t(log(z0), theta); + } + } + + // separated _logaddexp_minmax into 2 different functions for jiterator_string + template + std::complex logaddexp_min(const std::complex& x, const std::complex& y) { + T xr = x.real(); + T yr = y.real(); + if (isnan(yr) || isnan(y.imag())) { + return y; + } else if (isnan(xr) || isnan(x.imag())) { + return x; + } else { + return (xr < yr) ? x : y; + } + } + + template + std::complex logaddexp_max(const std::complex& x, const std::complex& y) { + T xr = x.real(); + T yr = y.real(); + if (isnan(yr) || isnan(y.imag())) { + return y; + } else if (isnan(xr) || isnan(x.imag())) { + return x; + } else { + return (xr >= yr) ? x : y; + } + } + + template + std::complex fast_build_exp(const std::complex& x) { + const auto xreal = x.real(); + const auto ximag = x.imag(); + const auto exp_x_abs = exp(xreal); + auto exp_x_real = exp_x_abs * cos(ximag); + auto exp_x_imag = exp_x_abs * sin(ximag); + return std::complex(exp_x_real, exp_x_imag); + } + + template + std::complex fast_build_exp_inf(const std::complex& x) { + using complex_t = std::complex; + const auto ximag = x.imag(); + const T exp_x_abs = INFINITY; + if (!isfinite(ximag)) { + return complex_t(exp_x_abs, NAN); + } + const auto sin_val = sin(ximag); + const auto cos_val = cos(ximag); + auto exp_x_real = (cos_val == T(0)) ? T(0) : exp_x_abs * cos_val; + auto exp_x_imag = (sin_val == T(0)) ? T(0) : exp_x_abs * sin_val; + return complex_t(exp_x_real, exp_x_imag); + } + + template + complex_t logaddexp_complex(complex_t x, complex_t y) { + using T = typename complex_t::value_type; + complex_t min_val = logaddexp_min(x, y); + complex_t max_val = logaddexp_max(x, y); + T min_real = min_val.real(); + T max_real = max_val.real(); + + if (isnan(min_real) || isnan(min_val.imag())) { + return complex_t(NAN, NAN); + } + else if ((!isfinite(min_real)) && (min_real == max_real)) { + if (min_real < T(0)) { + return min_val; + } else { + const auto exp_min = fast_build_exp_inf(min_val); + const auto exp_max = fast_build_exp_inf(max_val); + return log1p(exp_min + exp_max - complex_t(1, 0)); + } + } else { + const auto minmax = min_val - max_val; + complex_t exp_minmax; + if (!isfinite(minmax.real())) { + exp_minmax = (minmax.real() < T(0)) ? 
complex_t(0, 0) : fast_build_exp_inf(minmax); + } else { + exp_minmax = fast_build_exp(minmax); + } + return log1p(exp_minmax) + max_val; + } + } +); + +constexpr char logaddexp_complex_name[] = "logaddexp_complex"; void logaddexp_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_TYPES_AND2( + if (at::isComplexType(iter.dtype())) { +#if AT_USE_JITERATOR() + AT_DISPATCH_COMPLEX_TYPES_AND(at::ScalarType::ComplexHalf, iter.dtype(), "logaddexp_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/logaddexp_complex_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/2>(iter, logaddexp_complex_string); + }); +#else + AT_DISPATCH_COMPLEX_TYPES_AND(at::ScalarType::ComplexHalf, iter.dtype(), "logaddexp_cuda", [&]() { + using opmath_t = at::opmath_type; + gpu_kernel(iter, [] GPU_LAMBDA (scalar_t a_, scalar_t b_) -> scalar_t { + const auto a = static_cast(a_); + const auto b = static_cast(b_); + return static_cast(_log_add_exp_helper(a, b)); + }); + }); +#endif + } else { + AT_DISPATCH_FLOATING_TYPES_AND2( ScalarType::BFloat16, ScalarType::Half, iter.dtype(), "logaddexp_cuda", [&]() { @@ -29,6 +261,7 @@ void logaddexp_kernel_cuda(TensorIteratorBase& iter) { } }); }); + } } void logaddexp2_kernel_cuda(TensorIteratorBase& iter) { diff --git a/aten/src/ATen/native/cuda/Reduce.cu b/aten/src/ATen/native/cuda/Reduce.cu index 36a1313488245..b32c55a10df6b 100644 --- a/aten/src/ATen/native/cuda/Reduce.cu +++ b/aten/src/ATen/native/cuda/Reduce.cu @@ -11,7 +11,7 @@ static inline std::ostream& operator<<(std::ostream& out, dim3 dim) { if (dim.y == 1 && dim.z == 1) { out << dim.x; } else { - out << "[" << dim.x << "," << dim.y << "," << dim.z << "]"; + out << '[' << dim.x << ',' << dim.y << ',' << dim.z << ']'; } return out; } @@ -27,7 +27,7 @@ std::ostream& operator<<(std::ostream& out, const ReduceConfig& config) { out << "input_mult=["; for (int i = 0; i < 3; i++) { if (i != 0) { - out << ","; + out << ','; } out << config.input_mult[i]; } @@ -35,7 +35,7 @@ std::ostream& operator<<(std::ostream& out, const ReduceConfig& config) { out << "output_mult=["; for (int i = 0; i < 2; i++) { if (i != 0) { - out << ","; + out << ','; } out << config.output_mult[i]; } @@ -49,7 +49,7 @@ std::ostream& operator<<(std::ostream& out, const ReduceConfig& config) { out << "block=" << config.block() << ", "; out << "grid=" << config.grid() << ", "; out << "global_memory_size=" << config.global_memory_size(); - out << ")"; + out << ')'; return out; } diff --git a/aten/src/ATen/native/cuda/ScaledBlas.cpp b/aten/src/ATen/native/cuda/ScaledBlas.cpp index 0d2963874abbd..4ff61f71f2b61 100644 --- a/aten/src/ATen/native/cuda/ScaledBlas.cpp +++ b/aten/src/ATen/native/cuda/ScaledBlas.cpp @@ -59,6 +59,24 @@ // forward declare class cublasCommonArgs; +#ifndef _WIN32 +namespace fbgemm_gpu { + +// NOTE(slayton58): FBGemm_GPU kernels come from within the FBGemm repo. +// To update supported ops means a submodule bump, which is.. painful. Instead, we +// can simply forward-declare the methods we want to use.. Works at least as a short-term +// thing, but should still be fixed somewhere/somehow. 
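A minimal sketch of the forward-declaration approach this NOTE describes; the names below are illustrative, not FBGemm's actual API. The point is only that a translation unit can declare an external function with a matching signature and call it without including the library's headers, as long as the real definition is linked in:

```cpp
#include <cstdint>

namespace external_lib {
// Declaration only; the definition is expected to come from the prebuilt library
// this TU links against (hypothetical example, no particular library implied).
int64_t fused_gemm(int64_t m, int64_t n, int64_t k);
}  // namespace external_lib

int64_t call_without_header(int64_t m, int64_t n, int64_t k) {
  // Compiles against the declaration; the linker resolves the symbol later.
  return external_lib::fused_gemm(m, n, k);
}
```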
+at::Tensor f4f4bf16( + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + std::optional, + bool use_mx); + +} // namespace fbgemm_gpu +#endif + using at::blas::ScalingType; using at::blas::SwizzleType; @@ -722,7 +740,12 @@ _scaled_rowwise_rowwise( TORCH_CHECK_VALUE(scale_a.numel() == mat_a.size(0) && scale_a.scalar_type() == kFloat, "scale_a must have ", mat_a.size(0), " Float elements, got ", scale_a.numel()) TORCH_CHECK_VALUE(scale_b.numel() == mat_b.size(1) && scale_b.scalar_type() == kFloat, "scale_b must have ", mat_b.size(1), " Float elements, got ", scale_b.numel()) - TORCH_CHECK_VALUE(scale_a.stride(1) == 1, "expected scale_a.stride(1) to be 1, but got ", scale_a.stride(1)); + // if we have a scale of shape [256, 1] (say), then stride can be [1, 0] - handle this case + TORCH_CHECK_VALUE( + scale_a.stride(1) == 1 || + scale_a.size(1) == 1, + "expected scale_a.stride(1) to be 1, but got ", scale_a.stride(1) + ); TORCH_CHECK_VALUE(scale_b.stride(1) == 1, "expected scale_b.stride(1) to be 1, but got ", scale_b.stride(1)); auto scaling_choice_a = ScalingType::RowWise; @@ -1078,6 +1101,19 @@ _scaled_mxfp8_mxfp8( return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out); } +void +_check_mxfp4_support() { +#ifndef USE_ROCM + auto dprops = at::cuda::getCurrentDeviceProperties(); + // Only on B200 GPUs + TORCH_CHECK_NOT_IMPLEMENTED( + // B200 = 10.0, B300 = 10.3 + dprops->major == 10, + "MXFP4 scaling only supported in CUDA for B200/B300" + ); +#endif +} + Tensor& _scaled_mxfp4_mxfp4( @@ -1087,26 +1123,48 @@ _scaled_mxfp4_mxfp4( const std::optional& bias, const c10::ScalarType out_dtype, Tensor& out) { -#ifndef USE_ROCM - TORCH_CHECK_NOT_IMPLEMENTED(false, "MXFP4 scaling supported on ROCM only"); -#endif +#if defined(_WIN32) || (!defined(USE_ROCM) && !defined(USE_FBGEMM_GENAI)) + TORCH_CHECK_NOT_IMPLEMENTED(false, "MXFP4 scaling supported on ROCM and CUDA+FBGEMM_GENAI only"); +#else + _check_mxfp4_support(); // Restrictions: // A, B are FP4, scales are e8m0, A: shape K//32, B: K, N//32 TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2 && mat_b.scalar_type() == at::kFloat4_e2m1fn_x2, "mat_a and mat_b must be fp4 types, got: ", mat_a.scalar_type(), mat_b.scalar_type()); - auto scale_a_elems = ceil_div(2 * mat_a.size(0), 32) * mat_a.size(1); - auto scale_b_elems = ceil_div(2 * mat_b.size(1), 32) * mat_b.size(0); + // Packed FP4 format means actual-K = 2 * reported-K -- adjust + auto K_multiplier = 2; +#ifdef USE_ROCM + // AMD + auto scale_a_elems = ceil_div(K_multiplier * mat_a.size(0), 32) * mat_a.size(1); + auto scale_b_elems = ceil_div(K_multiplier * mat_b.size(1), 32) * mat_b.size(0); +#else + // NVIDIA + auto scale_a_elems = round_up(mat_a.size(0), 128) * round_up(ceil_div(K_multiplier * mat_a.size(1), 32), 4); + auto scale_b_elems = round_up(mat_b.size(1), 128) * round_up(ceil_div(K_multiplier * mat_b.size(0), 32), 4); +#endif TORCH_CHECK_VALUE(scale_a_elems == scale_a.numel(), "For Blockwise scaling scale_a should have ", scale_a_elems, " elements, got: ", scale_a.numel()); TORCH_CHECK_VALUE(scale_b_elems == scale_b.numel(), "For Blockwise scaling scale_b should have ", scale_b_elems, " elements, got: ", scale_b.numel()); +#ifdef USE_ROCM + // AMD + TORCH_CHECK_VALUE(swizzle_a == SwizzleType::NO_SWIZZLE, "scale_a must not be swizzled (NO_SWIZZLE format)"); + TORCH_CHECK_VALUE(swizzle_b == SwizzleType::NO_SWIZZLE, "scale_b must not be swizzled (NO_SWIZZLE format)"); +#else + // NVIDIA + 
TORCH_CHECK_VALUE(swizzle_a == SwizzleType::SWIZZLE_32_4_4, "scale_a must be swizzled to SWIZZLE_32_4_4 format"); + TORCH_CHECK_VALUE(swizzle_b == SwizzleType::SWIZZLE_32_4_4, "scale_b must be swizzled to SWIZZLE_32_4_4 format"); +#endif + TORCH_CHECK_VALUE(scale_a.is_contiguous() && scale_b.is_contiguous(), "For Blockwise scaling both scales should be contiguous"); TORCH_CHECK_VALUE(out.scalar_type() == out_dtype, "expected out.scalar_type() to be ", out_dtype, ", but got ", out_dtype); +#ifdef USE_ROCM + // AMD auto scaling_choice_a = ScalingType::BlockWise1x32; auto scaling_choice_b = ScalingType::BlockWise1x32; @@ -1121,11 +1179,30 @@ _scaled_mxfp4_mxfp4( TORCH_CHECK_VALUE(out.scalar_type() == ScalarType::BFloat16 || out.scalar_type() == ScalarType::Half, "Block-wise scaling only supports BFloat16 or Half output types"); -#else - TORCH_CHECK_NOT_IMPLEMENTED(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later"); #endif return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out); +#else + // NVIDIA + // NOTE(slayton58): fbgemm_gpu::f4f4bf16 does *not* allow passing an output tensor, + // but we have one we need to use. Two clear options are to copy into + // our output (slow), or use a move-assignment-operator (faster). + // However, the compiler can complain about the explicit move preventing + // copy elision because the return from f4f4bf16 is a temporary object. + // So we don't explicitly move, and trust the compiler here... + // In the longer term this should be fixed on the FBGemm side. + out = fbgemm_gpu::f4f4bf16( + mat_a, + mat_b.transpose(-2, -1), + scale_a, + scale_b, + std::nullopt, /* global_scale */ + true /* use_mx */ + ); + + return out; +#endif +#endif } Tensor& @@ -1250,17 +1327,20 @@ _scaled_mm_cuda_v2_out( mat_a.size(0), "x", mat_a.size(1), " and ", mat_b.size(0), "x", mat_b.size(1), ")"); } + // Handle fp4 packed-K dimension + int K_multiplier = (mat_a.scalar_type() == ScalarType::Float4_e2m1fn_x2) ? 2 : 1; + TORCH_CHECK_VALUE(!bias || bias->numel() == mat_b.sizes()[1], "Bias must be size ", mat_b.sizes()[1], " but got ", bias->numel()); TORCH_CHECK_VALUE( - mat_a.sizes()[1] % 16 == 0, + K_multiplier * mat_a.sizes()[1] % 16 == 0, "Expected trailing dimension of mat1 to be divisible by 16 ", "but got mat1 shape: (", mat_a.sizes()[0], "x", - mat_a.sizes()[1], + K_multiplier * mat_a.sizes()[1], ")."); - TORCH_CHECK_VALUE(mat_b.sizes()[0] % 16 == 0 && mat_b.sizes()[1] % 16 == 0, "mat2 shape (", mat_b.sizes()[0], "x", + TORCH_CHECK_VALUE(K_multiplier * mat_b.sizes()[0] % 16 == 0 && mat_b.sizes()[1] % 16 == 0, "mat2 shape (", mat_b.sizes()[0], "x", mat_b.sizes()[1], ") must be divisible by 16"); // TODO(slayton): Existing checks, not sure if they should really be here. 
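For reference, the packed-FP4 bookkeeping above can be reproduced with a minimal standalone host-side sketch. This is illustrative only and not part of the patch: the ceil_div/round_up helpers are re-implemented locally to mirror the ones used in ScaledBlas.cpp, and the tensor sizes (128x64 and 64x256) are arbitrary example values.

#include <cstdint>
#include <iostream>

namespace {
int64_t ceil_div(int64_t a, int64_t b) { return (a + b - 1) / b; }
int64_t round_up(int64_t a, int64_t b) { return ceil_div(a, b) * b; }
}  // namespace

int main() {
  // Float4_e2m1fn_x2 packs two FP4 values per element, so the effective
  // contraction dimension is twice the reported one.
  const int64_t K_multiplier = 2;

  // Illustrative sizes only: mat_a is [a0, a1], mat_b is [b0, b1].
  const int64_t a0 = 128, a1 = 64, b0 = 64, b1 = 256;

  // ROCm branch: plain 1x32 block scales, no swizzle.
  const int64_t scale_a_rocm = ceil_div(K_multiplier * a0, 32) * a1;
  const int64_t scale_b_rocm = ceil_div(K_multiplier * b1, 32) * b0;

  // NVIDIA branch: SWIZZLE_32_4_4 layout, rows padded to 128 and scale
  // columns padded to a multiple of 4.
  const int64_t scale_a_nv = round_up(a0, 128) * round_up(ceil_div(K_multiplier * a1, 32), 4);
  const int64_t scale_b_nv = round_up(b1, 128) * round_up(ceil_div(K_multiplier * b0, 32), 4);

  // The v2 entry point applies the same multiplier before its divisibility-by-16 checks.
  const bool k_divisible_by_16 = (K_multiplier * a1) % 16 == 0;

  std::cout << scale_a_rocm << ' ' << scale_b_rocm << ' '
            << scale_a_nv << ' ' << scale_b_nv << ' '
            << k_divisible_by_16 << '\n';
  return 0;
}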
diff --git a/aten/src/ATen/native/cuda/ScaledGroupMM.cu b/aten/src/ATen/native/cuda/ScaledGroupMM.cu index 9a06c5907febc..71c9c8dac766d 100644 --- a/aten/src/ATen/native/cuda/ScaledGroupMM.cu +++ b/aten/src/ATen/native/cuda/ScaledGroupMM.cu @@ -364,9 +364,9 @@ void f8f8bf16_grouped_gemm_impl_sm90( // reinterpret_cast( // stride_output_h + group_count); - // std::cout << "PTRS " << mat_a.data_ptr() << " " << mat_b.data_ptr() << " + // std::cout << "PTRS " << mat_a.data_ptr() << ' ' << mat_b.data_ptr() << " // " - // << out.data_ptr() << " " << scale_a.data_ptr() << " " + // << out.data_ptr() << ' ' << scale_a.data_ptr() << ' ' // << scale_b.data_ptr() << "\n"; // for (int i = 0; i < group_count; i++) { // std::cout << "A " << (void*)inputA_ptrs_h[i] << "\n"; diff --git a/aten/src/ATen/native/cuda/ScanUtils.cuh b/aten/src/ATen/native/cuda/ScanUtils.cuh index c4d86acb43e7b..693ad0cb6ce10 100644 --- a/aten/src/ATen/native/cuda/ScanUtils.cuh +++ b/aten/src/ATen/native/cuda/ScanUtils.cuh @@ -267,15 +267,15 @@ void scan_dim_with_indices(const TensorBase& self, const TensorBase& values, con * outer dimensions, which contains several "inner rows"). * Each thread processes a single inner row at a time. */ -template +template __global__ void tensor_kernel_scan_outer_dim(scalar_t *tgt_, const scalar_t *src_, const uint32_t num_orows, const uint32_t num_irows, const uint32_t row_size, const scalar_t init, BinaryOp binary_op) { for (uint32_t orow = blockIdx.x; orow < num_orows; orow += gridDim.x) { for (uint32_t irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) { - const scalar_t *src = src_ + orow * row_size * num_irows + irow; - scalar_t *tgt = tgt_ + orow * row_size * num_irows + irow; + const scalar_t *src = src_ + static_cast(orow) * row_size * num_irows + irow; + scalar_t *tgt = tgt_ + (index_t) orow * row_size * num_irows + irow; scalar_t acc = init; for (uint32_t col = 0; col < row_size; ++col) { @@ -409,10 +409,15 @@ __host__ void scan_outer_dim(const TensorBase& self, const TensorBase& result, check_fits_in_unsigned(num_irows, "num_irows"); check_fits_in_unsigned(num_orows, "num_orows"); check_fits_in_unsigned(row_size, "row_size"); - - tensor_kernel_scan_outer_dim<<>>( + if (static_cast(num_irows) * num_orows * row_size <= UINT_MAX) { + tensor_kernel_scan_outer_dim<<>>( + result.mutable_data_ptr(), self.const_data_ptr(), + num_orows, num_irows, row_size, init, binary_op); + } else { + tensor_kernel_scan_outer_dim<<>>( result.mutable_data_ptr(), self.const_data_ptr(), num_orows, num_irows, row_size, init, binary_op); + } C10_CUDA_KERNEL_LAUNCH_CHECK(); } diff --git a/aten/src/ATen/native/cuda/jit_utils.cpp b/aten/src/ATen/native/cuda/jit_utils.cpp index 09c8e74d4b2cf..e65fa4ceb38e9 100644 --- a/aten/src/ATen/native/cuda/jit_utils.cpp +++ b/aten/src/ATen/native/cuda/jit_utils.cpp @@ -1057,14 +1057,14 @@ std::string generate_code( // TODO these arrays are potentially of the different types, use function // traits to determine the types declare_load_arrays << f_inputs_type << " arg" << std::to_string(i) - << "[" << std::to_string(thread_work_size) << "];\n"; + << '[' << std::to_string(thread_work_size) << "];\n"; } env.s("declare_load_arrays", declare_load_arrays.str()); std::stringstream declare_store_arrays; for (int i = 0; i < nOutputs; i++) { declare_store_arrays << result_type << " out" << std::to_string(i) - << "[" << std::to_string(thread_work_size) << "];\n"; + << '[' << std::to_string(thread_work_size) << "];\n"; } 
env.s("declare_store_arrays", declare_store_arrays.str()); @@ -1217,7 +1217,7 @@ std::string generate_code( for (const auto i : c10::irange(nInputs)){ auto i_string = std::to_string(i); vector_inputs << "auto * input" << i_string << - " = reinterpret_cast(data[" << i_string << "+" << nOutputs << "])" << + " = reinterpret_cast(data[" << i_string << '+' << nOutputs << "])" << " + block_work_size * idx;\n"; } env.s("vector_inputs", vector_inputs.str()); @@ -1543,17 +1543,17 @@ NvrtcFunction jit_pwise_function( // Constructs file path by appending constructed cubin name to cache path std::stringstream ss; - ss << *cache_dir << "/"; + ss << *cache_dir << '/'; ss << kernel_name; #ifdef USE_ROCM ss << "_arch" << prop->gcnArchName; #else - ss << "_arch" << cuda_major << "." << cuda_minor; + ss << "_arch" << cuda_major << '.' << cuda_minor; #endif - ss << "_nvrtc" << nvrtc_major << "." << nvrtc_minor; + ss << "_nvrtc" << nvrtc_major << '.' << nvrtc_minor; ss << (compile_to_sass ? "_sass" : "_ptx"); - ss << "_" << code.length(); - ss << "_" << hash_code; + ss << '_' << code.length(); + ss << '_' << hash_code; file_path = ss.str(); std::ifstream readin{file_path, std::ios::in | std::ifstream::binary}; diff --git a/aten/src/ATen/native/cudnn/ConvShared.cpp b/aten/src/ATen/native/cudnn/ConvShared.cpp index 325b082f314d9..1584d5e9acd38 100644 --- a/aten/src/ATen/native/cudnn/ConvShared.cpp +++ b/aten/src/ATen/native/cudnn/ConvShared.cpp @@ -82,15 +82,15 @@ namespace native { std::ostream& operator<<(std::ostream& out, const ConvolutionParams& params) { out << "ConvolutionParams \n" - << " memory_format = " << params.memory_format << "\n" - << " data_type = " << cudnnTypeToString(params.dataType) << "\n" - << " padding = " << ArrayRef{params.padding} << "\n" - << " stride = " << ArrayRef{params.stride} << "\n" - << " dilation = " << ArrayRef{params.dilation} << "\n" - << " groups = " << params.groups << "\n" + << " memory_format = " << params.memory_format << '\n' + << " data_type = " << cudnnTypeToString(params.dataType) << '\n' + << " padding = " << ArrayRef{params.padding} << '\n' + << " stride = " << ArrayRef{params.stride} << '\n' + << " dilation = " << ArrayRef{params.dilation} << '\n' + << " groups = " << params.groups << '\n' << " deterministic = " << (params.deterministic ? "true" : "false") - << "\n" - << " allow_tf32 = " << (params.allow_tf32 ? "true" : "false") << "\n"; + << '\n' + << " allow_tf32 = " << (params.allow_tf32 ? 
"true" : "false") << '\n'; return out; } @@ -173,16 +173,16 @@ std::string repro_from_args(const ConvolutionParams& params) { at::globalContext().float32Precision( at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) - << "\n"; + << '\n'; ss << "torch.backends.cudnn.benchmark = " - << pybool(at::globalContext().benchmarkCuDNN()) << "\n"; + << pybool(at::globalContext().benchmarkCuDNN()) << '\n'; ss << "torch.backends.cudnn.deterministic = " << pybool(params.deterministic) - << "\n"; + << '\n'; ss << "torch.backends.cudnn.allow_tf32 = " << pybool(params.allow_tf32) - << "\n"; + << '\n'; ss << "data = torch.randn(" << ArrayRef(params.input_size, dim) << ", dtype=" << full_dtype << ", "; - ss << "device='cuda', requires_grad=True)" << to_channels_last << "\n"; + ss << "device='cuda', requires_grad=True)" << to_channels_last << '\n'; ss << "net = torch.nn.Conv" << dim - 2 << "d(" << in_channels << ", " << out_channels << ", "; ss << "kernel_size=" << ArrayRef(¶ms.weight_size[2], dim - 2) @@ -192,7 +192,7 @@ std::string repro_from_args(const ConvolutionParams& params) { ss << "dilation=" << ArrayRef(params.dilation, dim - 2) << ", "; ss << "groups=" << params.groups << ")\n"; ss << "net = net.cuda()." << partial_dtype << "()" << to_channels_last - << "\n"; + << '\n'; ss << "out = net(data)\n"; ss << "out.backward(torch.randn_like(out))\n"; ss << "torch.cuda.synchronize()\n\n"; diff --git a/aten/src/ATen/native/cudnn/Conv_v7.cpp b/aten/src/ATen/native/cudnn/Conv_v7.cpp index bc064e3ad3167..d5102910c6471 100644 --- a/aten/src/ATen/native/cudnn/Conv_v7.cpp +++ b/aten/src/ATen/native/cudnn/Conv_v7.cpp @@ -93,11 +93,10 @@ std::ostream& operator<<(std::ostream& out, const ConvolutionArgs& args) { << "input: " << args.idesc // already has a trailing newline << "output: " << args.odesc // already has a trailing newline << "weight: " << args.wdesc // already has a trailing newline - << "Pointer addresses: " - << "\n" - << " input: " << args.input.const_data_ptr() << "\n" - << " output: " << args.output.const_data_ptr() << "\n" - << " weight: " << args.weight.const_data_ptr() << "\n"; + << "Pointer addresses: " << '\n' + << " input: " << args.input.const_data_ptr() << '\n' + << " output: " << args.output.const_data_ptr() << '\n' + << " weight: " << args.weight.const_data_ptr() << '\n'; return out; } diff --git a/aten/src/ATen/native/hip/ck_group_gemm.h b/aten/src/ATen/native/hip/ck_group_gemm.h new file mode 100644 index 0000000000000..c50307c9f8ea3 --- /dev/null +++ b/aten/src/ATen/native/hip/ck_group_gemm.h @@ -0,0 +1,19 @@ +#pragma once + +#include +#include +#include + +namespace at { +namespace hip { +namespace detail { +void group_gemm_ck( + const at::Tensor& mat_a, + const at::Tensor& mat_b, + const std::optional& offs, + const std::optional& bias, + at::Tensor& out); + +} // namespace detail +} // namespace hip +} // namespace at diff --git a/aten/src/ATen/native/hip/ck_group_gemm.hip b/aten/src/ATen/native/hip/ck_group_gemm.hip new file mode 100644 index 0000000000000..c436ad660c1c7 --- /dev/null +++ b/aten/src/ATen/native/hip/ck_group_gemm.hip @@ -0,0 +1,462 @@ +#undef __HIP_NO_HALF_CONVERSIONS__ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +template +using S = ck::Sequence; + +namespace at { +namespace hip { +namespace detail { + +namespace CkTypes { + using BF16 = ck::bhalf_t; + using F16 = ck::half_t; + using F32 = float; + using PassThrough = 
ck::tensor_operation::element_wise::PassThrough; +} + +template +using GroupedGemmKernel = ck::tensor_operation::device::DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage< + ALayout, BLayout, ck::Tuple<>, ck::tensor_layout::gemm::RowMajor, + DataType, DataType, CkTypes::F32, DataType, ck::Tuple<>, DataType, + CkTypes::PassThrough, CkTypes::PassThrough, CkTypes::PassThrough, + ck::tensor_operation::device::GemmSpecialization::MNKPadding, + 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, + S<1,4,64,1>, S<0,2,1,3>, S<0,2,1,3>, + 3, 8, 8, 1, + S<1,4,64,1>, S<0,2,1,3>, S<0,2,1,3>, + 3, 8, 8, 1, + 1, 1, + S<1,32,1,8>, 4 +>; + +template +void launch_grouped_bgemm_ck_impl_dispatch( + const at::Tensor& mat_a, + const at::Tensor& mat_b, + const std::optional& offs, + at::Tensor& out) +{ + using DeviceOp = GroupedGemmKernel; + using PassThrough = CkTypes::PassThrough; + + std::vector gemm_descs; + std::vector p_a_ptrs, p_b_ptrs; + std::vector p_e_ptrs; + // Note: d_ptrs will be resized after we populate the other vectors + + const int mat_a_dim = mat_a.dim(); + const int mat_b_dim = mat_b.dim(); + + const char* a_ptr_base = reinterpret_cast(mat_a.data_ptr()); + const char* b_ptr_base = reinterpret_cast(mat_b.data_ptr()); + char* out_ptr_base = reinterpret_cast(out.data_ptr()); + const size_t a_element_size = mat_a.element_size(); + const size_t b_element_size = mat_b.element_size(); + const size_t out_element_size = out.element_size(); + + // for each group, calculate m,n,k,lda,ldb,ldc and A,B,out pointer base addresses. + if (mat_a_dim == 2 && mat_b_dim == 2) { + // 2D*2D case requires offset tensor + auto offs_accessor = offs->accessor(); + int num_groups = offs_accessor.size(0); + const int M = mat_a.size(0); // number of rows in A + const int N = mat_b.size(1); // number of columns in B + const int K = mat_a.size(1); // columns in A == rows in B + // for 2d*2d input, output is 3d. + // for each group, A columns (K) are sliced. M and N dimensions are not sliced. + for (int i = 0; i < num_groups; ++i) { + int start_k = (i == 0) ? 0 : offs_accessor[i-1]; + int end_k = offs_accessor[i]; + int k = end_k - start_k; + + //K dimension are sliced, hence select stride(1) always. 
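+ // e.g. with offs = [32, 96] (illustrative values), group 0 covers K columns [0, 32) + // and group 1 covers [32, 96); the full M and N extents are reused by every group.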
+ //K dimension is always dimension 1, regardless of memory layout (row/column major) + const void* group_a_ptr = a_ptr_base + start_k * mat_a.stride(1) * a_element_size; + const void* group_b_ptr; + int ldb; + + if (std::is_same::value) { + // Row-major B [K,N]: K values are horizontally adjacent, use stride(1) for K offset + group_b_ptr = b_ptr_base + start_k * mat_b.stride(1) * b_element_size; + // Leading dimension = distance between rows = stride(0) + ldb = mat_b.stride(0); + } else { + // Column-major B [K,N]: K values are vertically adjacent, use stride(0) for K offset + group_b_ptr = b_ptr_base + start_k * mat_b.stride(0) * b_element_size; + // Leading dimension = distance between columns = stride(1) + ldb = mat_b.stride(1); + } + + // Calculate output pointer for group i in 3D tensor [num_groups, M, N] + // stride(0) = M*N elements between groups, so skip i*stride(0) elements to reach group i + void* group_e_ptr = out_ptr_base + i * out.stride(0) * out_element_size; + int lda, ldc; + if (std::is_same::value) { + // Row-major A [M,K]: leading dimension = distance between rows = stride(0) + lda = mat_a.stride(0); + } else { + // Column-major A [M,K]: leading dimension = distance between columns = stride(1) + lda = mat_a.stride(1); + } + // Output is always row-major in 3D tensor [num_groups, M, N] + // Leading dimension for each group's [M,N] slice = stride(1) = N + ldc = out.stride(1); + size_t output_group_bytes = M * N * out_element_size; + void* group_e_ptr_end = (char*)group_e_ptr + output_group_bytes; + + gemm_descs.push_back({ + static_cast(M), + static_cast(N), + static_cast(k), + static_cast(lda), + static_cast(ldb), + static_cast(ldc), + {} // --> stride_Ds_ + }); + p_a_ptrs.push_back(group_a_ptr); + p_b_ptrs.push_back(group_b_ptr); + p_e_ptrs.push_back(group_e_ptr); + } + } else if (mat_a_dim == 2 && mat_b_dim == 3) { + // 2D*3D case requires offset tensor + auto offs_accessor = offs->accessor(); + int num_groups = offs_accessor.size(0); + + // 2d*3d input, output is 2d. + // A: [m * n_groups, k], B: [n_groups, n, k] or [n_groups, k, n], Output: [m * n_groups, n] + // Offset divides M dimension (rows of A), each group gets different rows of A and different batch of B + const int K = mat_a.size(1); // columns in A + // For 2D-3D case: The output determines N (result width) + const int N = out.size(1); // N is the width of the output tensor + + for (int i = 0; i < num_groups; ++i) { + int start_m = (i == 0) ? 
0 : offs_accessor[i - 1]; + int end_m = offs_accessor[i]; + int m = end_m - start_m; + + // Skip zero-sized groups but continue processing subsequent groups + if (m <= 0) { + continue; + } + + // Select A rows for group i: skip start_m rows + const void* group_a_ptr; + int lda; + if (std::is_same::value) { + // Row-major A [total_m, K]: skip start_m rows, each row is stride(0) elements apart + group_a_ptr = a_ptr_base + start_m * mat_a.stride(0) * a_element_size; + lda = mat_a.stride(0); // distance between rows + } else { + // Column-major A [total_m, K]: skip start_m elements in the first dimension (stride(0) is between rows) + group_a_ptr = a_ptr_base + start_m * mat_a.stride(0) * a_element_size; + + // Detect stride pattern for A tensor to determine appropriate lda calculation + bool a_is_strided_tensor = (mat_a.stride(0) > mat_a.size(0)); + + if (a_is_strided_tensor) { + // For strided A tensors: stride(0) gives the actual leading dimension + lda = mat_a.stride(0); + } else { + // For non-strided A tensors: use the M dimension (total rows) + lda = mat_a.size(0); // Total M dimension for column-major layout + } + } + + // Select B batch for group i: B[i, :, :] + const void* group_b_ptr = b_ptr_base + i * mat_b.stride(0) * b_element_size; + int ldb; + + if (std::is_same::value) { + // Row-major GEMM: expecting B as [K, N] but we have [N, K], so transpose needed + ldb = mat_b.stride(2); // Leading dimension for accessing as [K, N] + } else { + // Detect stride pattern to determine appropriate ldb calculation + bool is_strided_tensor = (mat_b.stride(2) > mat_b.size(2)); + + if (is_strided_tensor) { + // For strided tensors: stride(2) gives the actual leading dimension + ldb = mat_b.stride(2); + } else { + // For non-strided tensors: use the N dimension + ldb = mat_b.size(1); + } + } + + // Output for this group: rows [start_m:end_m, :] in 2D output [total_m, N] + void* group_e_ptr = out_ptr_base + start_m * out.stride(0) * out_element_size; + int ldc = out.stride(0); // distance between rows in output (should be N for 2D case) + + gemm_descs.push_back({ + static_cast(m), + static_cast(N), + static_cast(K), + static_cast(lda), + static_cast(ldb), + static_cast(ldc), + {} // --> stride_Ds_ + }); + p_a_ptrs.push_back(group_a_ptr); + p_b_ptrs.push_back(group_b_ptr); + p_e_ptrs.push_back(group_e_ptr); + } + } else if (mat_a_dim == 3 && mat_b_dim == 3) { + // 3d*3d input, output is 3d - batched matrix multiplication + // A: [batch, m, k], B: [batch, k, n] or [batch, n, k] (depending on transpose), Output: [batch, m, n] + // Each batch is processed as a separate GEMM operation + const int batch_size = mat_a.size(0); + const int M = mat_a.size(1); // rows in each A matrix + const int K = mat_a.size(2); // columns in A == rows in B (or columns if B is transposed) + + // Determine N from B tensor - it could be B.size(1) or B.size(2) depending on layout + int N; + if (mat_b.size(1) == K) { + // B is [batch, k, n] - normal layout + N = mat_b.size(2); + } else if (mat_b.size(2) == K) { + // B is [batch, n, k] - transposed layout + N = mat_b.size(1); + } else { + TORCH_CHECK(false, "CK Group GEMM 3D-3D: B tensor dimensions incompatible with A. 
A=[", + batch_size, ",", M, ",", K, "], B=[", mat_b.size(0), ",", mat_b.size(1), ",", mat_b.size(2), "]"); + } + + for (int i = 0; i < batch_size; ++i) { + // Select A batch for group i: A[i, :, :] + const void* group_a_ptr = a_ptr_base + i * mat_a.stride(0) * a_element_size; + + // Select B batch for group i: B[i, :, :] + const void* group_b_ptr = b_ptr_base + i * mat_b.stride(0) * b_element_size; + + // Select output batch for group i: Output[i, :, :] + void* group_e_ptr = out_ptr_base + i * out.stride(0) * out_element_size; + + int lda, ldb, ldc; + + if (std::is_same::value) { + // Row-major A: leading dimension = distance between rows = stride(1) + lda = mat_a.stride(1); + } else { + // Column-major A: leading dimension = distance between columns = stride(2) + lda = mat_a.stride(2); + } + + if (std::is_same::value) { + // Row-major B: leading dimension = distance between rows + if (mat_b.size(1) == K) { + // B is [batch, k, n] - normal layout + ldb = mat_b.stride(1); // stride between K rows + } else { + // B is [batch, n, k] - transposed layout, treat as [k, n] for GEMM + ldb = mat_b.stride(2); // stride between N rows (since we're accessing as [k,n]) + } + } else { + // Column-major B: leading dimension = distance between columns + if (mat_b.size(1) == K) { + // B is [batch, k, n] - normal layout + ldb = mat_b.stride(2); // stride between N columns + } else { + // B is [batch, n, k] - transposed layout + ldb = mat_b.stride(1); // stride between K columns (since we're accessing as [n,k]→[k,n]) + } + } + + // Output is typically row-major: leading dimension = distance between rows = stride(1) + ldc = out.stride(1); + + gemm_descs.push_back({ + static_cast(M), + static_cast(N), + static_cast(K), + static_cast(lda), + static_cast(ldb), + static_cast(ldc), + {} // --> stride_Ds_ + }); + p_a_ptrs.push_back(group_a_ptr); + p_b_ptrs.push_back(group_b_ptr); + p_e_ptrs.push_back(group_e_ptr); + } + } else if (mat_a_dim == 3 && mat_b_dim == 2) { + // 3D*2D case requires offset tensor + auto offs_accessor = offs->accessor(); + int num_groups = offs_accessor.size(0); + // 3d*2d input, output is 3d. + // A: [n_groups, m, k], B: [k, total_n] (assuming row-major for both) + // Offset divides N dimension of B, each group gets different slice of B and different batch of A + const int batch_size = mat_a.size(0); // n_groups + const int M = mat_a.size(1); // rows in each A matrix + const int K = mat_a.size(2); // columns in A + + // For row-major A and B case: B should be [K, total_N] + const int total_N = mat_b.size(1); // B is [K, total_N] for row-major + + for (int i = 0; i < num_groups; ++i) { + int start_n = (i == 0) ? 
0 : offs_accessor[i - 1]; + int end_n = offs_accessor[i]; + int n = end_n - start_n; + + // Skip zero-sized groups but continue processing subsequent groups + if (n <= 0) { + continue; + } + + // Select A batch for group i: A[i, :, :] + const void* group_a_ptr = a_ptr_base + i * mat_a.stride(0) * a_element_size; + + // Select B slice for group i: B[:, start_n:end_n] (B[K, total_N]) + const void* group_b_ptr; + int ldb; + + // Check if B is row-major or column-major + if (std::is_same::value) { + // Row-major B [K, total_N]: slice columns [start_n:end_n] + group_b_ptr = b_ptr_base + start_n * mat_b.stride(1) * b_element_size; + ldb = mat_b.stride(0); // distance between rows (should be total_N) + } else { + // Column-major B [K, total_N]: slice columns [start_n:end_n] + group_b_ptr = b_ptr_base + start_n * mat_b.stride(1) * b_element_size; + ldb = mat_b.stride(1); // distance between columns (should be K) + } + + // Select output slice for group i: Output[:, start_n:end_n] + void* group_e_ptr = out_ptr_base + start_n * out.stride(1) * out_element_size; + + int lda, ldc; + + // Row-major A: leading dimension = distance between rows = stride(1) + lda = mat_a.stride(1); + // Output is row-major: leading dimension = distance between rows = stride(0) + ldc = out.stride(0); + + gemm_descs.push_back({ + static_cast(M), + static_cast(n), + static_cast(K), + static_cast(lda), + static_cast(ldb), + static_cast(ldc), + {} // --> stride_Ds_ + }); + p_a_ptrs.push_back(group_a_ptr); + p_b_ptrs.push_back(group_b_ptr); + p_e_ptrs.push_back(group_e_ptr); + } + } else { + TORCH_CHECK(false, "CK Group GEMM: Unsupported dimensions, mat A dim is ", mat_a_dim, ", mat B dim is ", mat_b_dim); + } + + TORCH_INTERNAL_ASSERT(p_a_ptrs.size() > 0, "CK Group GEMM: No valid groups"); + + // Initialize d_ptrs with the correct size + std::vector> d_ptrs(p_a_ptrs.size()); + + static DeviceOp gemm_instance; + auto argument = gemm_instance.MakeArgument( + p_a_ptrs, p_b_ptrs, d_ptrs, p_e_ptrs, + gemm_descs, PassThrough{}, PassThrough{}, PassThrough{} + ); + TORCH_INTERNAL_ASSERT(gemm_instance.IsSupportedArgument(argument), + "CK Group GEMM: argument unsupported (shape/strides/type config)"); + size_t arg_buf_size = gemm_instance.GetDeviceKernelArgSize(&argument); + size_t ws_size = gemm_instance.GetWorkSpaceSize(&argument); + + void* gemm_arg_buf = nullptr; + void* ws_buf = nullptr; + + hipMalloc(&gemm_arg_buf, arg_buf_size); + hipMalloc(&ws_buf, ws_size); + + gemm_instance.SetDeviceKernelArgs(&argument, gemm_arg_buf); + gemm_instance.SetWorkSpacePointer(&argument, ws_buf); + + auto invoker = gemm_instance.MakeInvoker(); + hipStream_t stream = c10::hip::getCurrentHIPStream(); + invoker.Run(argument, {stream}); + hipFree(gemm_arg_buf); + hipFree(ws_buf); +} + +void group_gemm_ck( + const at::Tensor& input_a, + const at::Tensor& input_b_colmajor, + const std::optional& offs, + const std::optional& /*bias*/, + at::Tensor& out) +{ + // Detect if input_a is row-major based on stride pattern + bool a_row_major = (input_a.dim() == 3) ? (input_a.stride(2) == 1) : (input_a.stride(1) == 1); + bool b_col_major = (input_b_colmajor.dim() == 3) ? 
(input_b_colmajor.stride(1) == 1) : (input_b_colmajor.stride(0) == 1); + // Ensure tensor A is row-major and contiguous if not already + at::Tensor mat_a = input_a; + if (!a_row_major) { + // If A is not row-major, make it contiguous (row-major) + mat_a = input_a.contiguous(); + } + // Force tensor B to be column-major using double transpose trick + // This guarantees stride(0) == 1 and stride(1) == K for [K, N] shape + at::Tensor mat_b = input_b_colmajor; + if (!b_col_major) { + mat_b = input_b_colmajor.transpose(-2, -1).contiguous().transpose(-2, -1); + } + + // For 3D tensors, check the last dimension stride for row-major detection + a_row_major = (mat_a.dim() == 3) ? (mat_a.stride(2) == 1) : (mat_a.stride(1) == 1); + bool b_row_major = (mat_b.dim() == 3) ? (mat_b.stride(2) == 1) : (mat_b.stride(1) == 1); + + if (mat_a.dtype() == at::kBFloat16) { + // bf16 path + if (a_row_major && b_row_major) { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } else if (a_row_major && !b_row_major) { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } else if (!a_row_major && b_row_major) { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } else { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } + } else if (mat_a.dtype() == at::kHalf) { + // fp16 path + if (a_row_major && b_row_major) { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } else if (a_row_major && !b_row_major) { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } else if (!a_row_major && b_row_major) { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } else { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } + } else if (mat_a.dtype() == at::kFloat) { + // fp32 path + if (a_row_major && b_row_major) { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } else if (a_row_major && !b_row_major) { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } else if (!a_row_major && b_row_major) { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } else { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } + } else { + TORCH_CHECK(false, "CK Group GEMM: Unsupported mat_a dtype"); + } + +} + +} // namespace detail +} // namespace hip +} // namespace at diff --git a/aten/src/ATen/native/metal/MetalTensorImplStorage.mm b/aten/src/ATen/native/metal/MetalTensorImplStorage.mm index f614429eefddf..20a942a9e2573 100644 --- a/aten/src/ATen/native/metal/MetalTensorImplStorage.mm +++ b/aten/src/ATen/native/metal/MetalTensorImplStorage.mm @@ -115,7 +115,7 @@ void copy_data_to_host(float* host) { std::copy( strides.begin(), strides.end() - 1, std::ostream_iterator(oss, ",")); oss << sizes.back(); - output << oss.str() << "}"; + output << oss.str() << '}'; return output; } diff --git a/aten/src/ATen/native/mkldnn/xpu/Conv.cpp b/aten/src/ATen/native/mkldnn/xpu/Conv.cpp index 67558aeebbb83..6827e02cc3f42 100644 --- a/aten/src/ATen/native/mkldnn/xpu/Conv.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/Conv.cpp @@ -53,7 +53,7 @@ std::ostream& operator<<(std::ostream& out, const ConvParams& params) { << " transposed = " << params.transposed << " output_padding = " << IntArrayRef{params.output_padding} << " groups = " << params.groups << " benchmark = " << params.benchmark - << " deterministic = " << params.deterministic << "}"; + << " deterministic = " << params.deterministic << '}'; return out; } @@ -337,10 +337,6 @@ Tensor _convolution_out( TORCH_CHECK( 
3 == ndim || 4 == ndim || 5 == ndim, "convolution only supports 3D, 4D, 5D tensor"); - // get computation format for Conv/TransposedConv - bool is_channels_last_suggested = - use_channels_last_for_conv(input_r, weight_r); - Tensor input = input_r, weight = weight_r; // PyTorch does not support ChannelsLast1D case, // thus we need the transformation here @@ -348,13 +344,8 @@ Tensor _convolution_out( input = view4d(input_r); weight = view4d(weight_r); } - // ensure the input/weight/bias/output are congituous in desired format - at::MemoryFormat mfmt = is_channels_last_suggested - ? get_cl_tag_by_ndim(input.ndimension()) - : at::MemoryFormat::Contiguous; - auto bias = bias_r.defined() ? bias_r.contiguous() : bias_r; - input = input.contiguous(mfmt); - weight = weight.contiguous(mfmt); + // get computation format for Conv/TransposedConv + bool is_channels_last_suggested = use_channels_last_for_conv(input, weight); auto k = weight.ndimension(); if (k == input.ndimension() + 1) { @@ -388,6 +379,14 @@ Tensor _convolution_out( expand_param_if_needed(output_padding_, "output_padding", dim); params.groups = groups_; } + + // ensure the input/weight/bias/output are congituous in desired format + at::MemoryFormat mfmt = is_channels_last_suggested + ? get_cl_tag_by_ndim(input.ndimension()) + : at::MemoryFormat::Contiguous; + auto bias = bias_r.defined() ? bias_r.contiguous() : bias_r; + input = input.contiguous(mfmt); + weight = weight.contiguous(mfmt); check_shape_forward(input, weight, bias, params, true); Tensor output; @@ -514,18 +513,9 @@ Tensor convolution_overrideable( at::borrow_from_optional_tensor(bias_r_opt); const Tensor& bias_r = *bias_r_maybe_owned; - auto k = weight_r.ndimension(); - at::MemoryFormat backend_memory_format = at::MemoryFormat::Contiguous; - if (xpu_conv_use_channels_last(input_r, weight_r)) { - backend_memory_format = (k == 5) ? at::MemoryFormat::ChannelsLast3d - : at::MemoryFormat::ChannelsLast; - } - Tensor input_c = input_r.contiguous(backend_memory_format); - Tensor weight_c = weight_r.contiguous(backend_memory_format); - return _convolution( - input_c, - weight_c, + input_r, + weight_r, bias_r, stride_, padding_, diff --git a/aten/src/ATen/native/mkldnn/xpu/ScaledBlas.cpp b/aten/src/ATen/native/mkldnn/xpu/ScaledBlas.cpp new file mode 100644 index 0000000000000..2b715c053abc3 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/ScaledBlas.cpp @@ -0,0 +1,738 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +namespace at::native { + +using at::blas::ScalingType; +using at::blas::SwizzleType; + +namespace { +/* + * Scaling Type Determination: + * --------------------------- + * Conditions and corresponding Scaling Types: + * + * - If scale tensor is `Float8_e8m0fnu` or `Float8_e4m3fn`: + * - Returns BlockWise (with additional size checks). + * + * - Else if scale.numel() == 1: + * - Returns TensorWise. + * + * - Else if scale.dim() == 2 && scale.size(0) == outer_dim && scale.size(1) == + * 1: + * - Returns RowWise. + * + * - Otherwise: + * - Returns Error. 
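+ * + * Example (sizes chosen for illustration): for a float8 matmul A[128, 64] x B[64, 256], + * TensorWise means scale_a and scale_b are single-element Float tensors, while RowWise + * means scale_a is a contiguous [128, 1] Float tensor and scale_b is a contiguous + * [1, 256] Float tensor.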
+ */ + +bool is_tensorwise_scaling(const at::Tensor& t, const at::Tensor& scale) { + return at::isFloat8Type(t.scalar_type()) && + scale.scalar_type() == at::kFloat && scale.numel() == 1; +} + +bool is_rowwise_scaling(const at::Tensor& t, const at::Tensor& scale) { + return ( + at::isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat && + scale.dim() == 2 && scale.size(0) == t.size(0) && scale.size(1) == 1 && + scale.is_contiguous()); +} + +bool is_desired_scaling( + const at::Tensor& t, + const at::Tensor& scale, + ScalingType desired_scaling) { + auto result = desired_scaling == ScalingType::TensorWise + ? is_tensorwise_scaling(t, scale) + : is_rowwise_scaling(t, scale); + return result; +} + +std::pair get_joint_scaling( + std::initializer_list> options, + const at::Tensor& a, + const at::Tensor& b, + const at::Tensor& scale_a, + const at::Tensor& scale_b) { + for (auto [lhs, rhs] : options) { + if (is_desired_scaling(a, scale_a, lhs) && + is_desired_scaling(b.t(), scale_b.t(), rhs)) { + return {lhs, rhs}; + } + } + TORCH_CHECK( + false, + "Invalid scaling configuration.\n" + "- For TensorWise scaling, a and b should be float8, scales should be float and singletons.\n" + "- For RowWise scaling, a and b should be float8, scales should be float, scale_a should be (", + a.size(0), + ", 1) and scale_b should be (1, ", + b.size(1), + "), and both should be contiguous.\n" + "Got a.dtype()=", + a.scalar_type(), + ", scale_a.dtype()=", + scale_a.scalar_type(), + ", scale_a.size()=", + scale_a.sizes(), + ", scale_a.stride()=", + scale_a.strides(), + ", ", + "b.dtype()=", + b.scalar_type(), + ", scale_b.dtype()=", + scale_b.scalar_type(), + ", scale_b.size()=", + scale_b.sizes(), + " and scale_b.stride()=", + scale_b.strides()); +} + +Tensor& _scaled_gemm( + const Tensor& mat1, + const Tensor& mat2, + const Tensor& scale_a, + const Tensor& scale_b, + const ScalingType scaling_choice_a, + const ScalingType scaling_choice_b, + const std::optional& bias, + const bool use_fast_accum, + Tensor& out, + const std::optional& alpha = std::nullopt) { + // TODO: scale_result and alpha is not defined or used! + std::optional scaled_result = std::nullopt; + at::native::onednn::scaled_matmul( + mat1, + mat2, + out, + scale_a, + scale_b, + scaling_choice_a, + scaling_choice_b, + bias, + scaled_result, + use_fast_accum); + + return out; +} + +} // namespace + +// Computes matrix multiply + bias while applying scaling to input and output +// matrices Scales are only applicable when matrices are of Float8 type and +// assumed to be equal to 1.0 by default. If output matrix type is 16 or 32-bit +// type, scale_result is not applied. 
Known limitations: +// - Only works if mat1 is row-major and mat2 is column-major +// - Only works if matrices sizes are divisible by 32 +// - If 1-dimensional tensors are used then scale_a should be size = +// mat1.size(0) +// and scale_b should have size = to mat2.size(1) +// Arguments: +// - `mat1`: the first operand of the matrix multiply, can be type +// `torch.float8_e4m3fn` or `torch.float8_e5m2` +// - `mat2`: the second operand of the matrix multiply, can be type +// `torch.float8_e4m3fn` or `torch.float8_e5m2` +// - `bias`: the bias, can be type `torch.float16` or `torch.bfloat16` +// - `out_dtype`: the output dtype, can either be a float8 or a higher +// precision floating point type +// - `scale_a`: a tensor with the inverse scale of `mat1`, whose +// shape/strides/dtype depend on the scaling scheme +// - `scale_b`: a tensor with the inverse scale of `mat2`, whose +// shape/strides/dtype depend on the scaling scheme +// - `scale_result`: a scalar tensor with the scale of the output, only +// utilized if the output is a float8 type +// - `use_fast_accum`: Not applicable for XPU. For now, it should always be +// false. +// - `out`: a reference to the output tensor + +Tensor& _scaled_mm_out_xpu( + const Tensor& mat1, + const Tensor& mat2, + const Tensor& scale_a, + const Tensor& scale_b, + const std::optional& bias, + const std::optional& scale_result, + std::optional out_dtype, + bool use_fast_accum, + Tensor& out) { + // Note: fast_accum is not supported in XPU for now. + TORCH_CHECK(!use_fast_accum, "fast_accum is not supported in XPU for now."); + + TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix"); + TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix"); + + TORCH_CHECK( + mat1.sizes()[1] == mat2.sizes()[0], + "mat1 and mat2 shapes cannot be multiplied (", + mat1.sizes()[0], + "x", + mat1.sizes()[1], + " and ", + mat2.sizes()[0], + "x", + mat2.sizes()[1], + ")"); + + // Check what type of scaling we are doing based on inputs. This list is + // sorted by decreasing priority. + + // List of supported datatypes for XPU with oneDNN: + // https://uxlfoundation.github.io/oneDNN/dev_guide_matmul.html#data-types + auto [scaling_choice_a, scaling_choice_b] = get_joint_scaling( + { + std::make_pair(ScalingType::TensorWise, ScalingType::TensorWise), + std::make_pair(ScalingType::RowWise, ScalingType::RowWise), + }, + mat1, + mat2, + scale_a, + scale_b); + TORCH_CHECK( + !scale_result || + (scale_result->numel() == 1 && scale_result->scalar_type() == kFloat), + "scale_result must be a float scalar"); + TORCH_CHECK( + !bias || bias->numel() == mat2.sizes()[1], + "Bias must be size ", + mat2.sizes()[1], + " but got ", + bias->numel()); + TORCH_CHECK( + mat1.sizes()[1] % 16 == 0, + "Expected trailing dimension of mat1 to be divisible by 16 ", + "but got mat1 shape: (", + mat1.sizes()[0], + "x", + mat1.sizes()[1], + ")."); + TORCH_CHECK( + mat2.sizes()[0] % 16 == 0 && mat2.sizes()[1] % 16 == 0, + "mat2 shape (", + mat2.sizes()[0], + "x", + mat2.sizes()[1], + ") must be divisible by 16"); + // Check types + TORCH_CHECK( + !out_dtype || *out_dtype == out.scalar_type(), + "out_dtype must match output matrix type"); + TORCH_CHECK( + at::isFloat8Type(mat1.scalar_type()), + "Expected mat1 to be Float8 matrix got ", + mat1.scalar_type()); + TORCH_CHECK( + at::isFloat8Type(mat2.scalar_type()), + "Expected mat2 to be Float8 matrix got ", + mat2.scalar_type()); + // TODO: oneDNN Currently only supports e4m3 with group scales on BMG. Not + // support 2D scales, only 1D. 
Needs to add more checks there. + + if (bias) { + TORCH_CHECK( + bias->scalar_type() == kFloat || + bias->scalar_type() == c10::ScalarType::BFloat16 || + bias->scalar_type() == c10::ScalarType::Half, + "Bias must be Float32 or BFloat16 or Half, but got ", + bias->scalar_type()); + } + + { + auto bias_ = bias.value_or(Tensor()); + auto scale_result_ = scale_result.value_or(Tensor()); + + // NOLINTNEXTLINE(*c-array*) + TensorArg targs[]{ + {out, "out", 0}, + {mat1, "mat1", 1}, + {mat2, "mat2", 2}, + {bias_, "bias", 3}, + {scale_a, "scale_a", 4}, + {scale_b, "scale_b", 5}, + {scale_result_, "scale_result", 6}}; + checkAllSameGPU(__func__, targs); + } + + // Validation checks have passed lets resize the output to actual size + IntArrayRef mat1_sizes = mat1.sizes(); + IntArrayRef mat2_sizes = mat2.sizes(); + at::native::resize_output(out, {mat1_sizes[0], mat2_sizes[1]}); + + // If any of M, K, N is 0 - return early (the tensorwise/rowwise float8 gemm + // kernels do not support this case). + if (mat1_sizes[0] == 0 || mat1_sizes[1] == 0 || mat2_sizes[1] == 0) { + // `out` was created with `at::empty`. In the case where we are multiplying + // MxK by KxN and K is the zero dim, we need to initialize here to properly + // return a tensor of zeros. + if (mat1_sizes[1] == 0) { + out.zero_(); + } + + return out; + } + + // TODO: Scale_result is not supported by now!! + return _scaled_gemm( + mat1, + mat2, + scale_a, + scale_b, + scaling_choice_a, + scaling_choice_b, + bias, + use_fast_accum, + out); +} + +Tensor _scaled_mm_xpu( + const Tensor& mat_a, + const Tensor& mat_b, + const Tensor& scale_a, + const Tensor& scale_b, + const std::optional& bias, + const std::optional& scale_result, + std::optional out_dtype, + bool use_fast_accum) { + const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type()); + Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_)); + return _scaled_mm_out_xpu( + mat_a, + mat_b, + scale_a, + scale_b, + bias, + scale_result, + out_dtype, + use_fast_accum, + out); +} + +using acceptance_fn = std::function&, + ArrayRef&, + c10::ScalarType, + std::vector&, + ArrayRef&)>; +using namespace std::placeholders; + +namespace scaled_blas = at::native::onednn::scaled; +using scaled_blas::convert_int_to_enum; +using scaled_blas::ScaledGemmImplementation; + +std::array, 2> + scale_kernel_dispatch = {{ + {"tensorwise_tensorwise", + scaled_blas::check_tensorwise_recipe, + ScaledGemmImplementation::TENSORWISE_TENSORWISE}, + {"rowwise_rowwise", + scaled_blas::check_rowwise_recipe, + ScaledGemmImplementation::ROWWISE_ROWWISE}, + + }}; + +Tensor& _scaled_tensorwise_tensorwise( + const Tensor& mat_a, + const Tensor& mat_b, + const Tensor& scale_a, + const Tensor& scale_b, + const std::optional& bias, + const c10::ScalarType out_dtype, + bool use_fast_accum, + Tensor& out) { + // Restrictions: + // A, B are FP8, scales are fp32 + + TORCH_CHECK_VALUE( + isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), + "mat_a and mat_b must be fp8 types, got: ", + mat_a.scalar_type(), + mat_b.scalar_type()); + TORCH_CHECK_VALUE( + scale_a.numel() == 1 && scale_a.scalar_type() == kFloat, + "scale_a must have 1 Float element") + TORCH_CHECK_VALUE( + scale_b.numel() == 1 && scale_b.scalar_type() == kFloat, + "scale_b must have 1 Float element") + + auto scaling_choice_a = ScalingType::TensorWise; + auto scaling_choice_b = ScalingType::TensorWise; + + _scaled_gemm( + mat_a, + mat_b, + scale_a, + scale_b, + scaling_choice_a, + scaling_choice_b, + bias, + use_fast_accum, + out); + + 
return out; +} + +Tensor& _scaled_rowwise_rowwise( + const Tensor& mat_a, + const Tensor& mat_b, + const Tensor& scale_a, + const Tensor& scale_b, + const std::optional& bias, + const c10::ScalarType out_dtype, + bool use_fast_accum, + Tensor& out) { + // Restrictions: + // A, B are FP8, scales are fp32, shape M/N for A/B + TORCH_CHECK_VALUE( + isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), + "mat_a and mat_b must be fp8 types, got: ", + mat_a.scalar_type(), + mat_b.scalar_type()); + TORCH_CHECK_VALUE( + scale_a.size(0) == mat_a.size(0) && scale_a.size(1) == 1, + "scale_a must have shape [", + mat_a.size(0), + ", 1], got [", + scale_a.sizes(), + "]"); + TORCH_CHECK_VALUE( + scale_a.numel() == mat_a.size(0) && scale_a.scalar_type() == kFloat, + "scale_a must have ", + mat_a.size(0), + " Float elements, got ", + scale_a.numel()) + TORCH_CHECK_VALUE( + scale_b.numel() == mat_b.size(1) && scale_b.scalar_type() == kFloat, + "scale_b must have ", + mat_b.size(1), + " Float elements, got ", + scale_b.numel()) + + TORCH_CHECK_VALUE( + scale_a.stride(1) == 1, + "expected scale_a.stride(1) to be 1, but got ", + scale_a.stride(1)); + TORCH_CHECK_VALUE( + scale_b.stride(1) == 1, + "expected scale_b.stride(1) to be 1, but got ", + scale_b.stride(1)); + + auto scaling_choice_a = ScalingType::RowWise; + auto scaling_choice_b = ScalingType::RowWise; + + _scaled_gemm( + mat_a, + mat_b, + scale_a, + scale_b, + scaling_choice_a, + scaling_choice_b, + bias, + use_fast_accum, + out); + + return out; +} + +// V2: Computes matrix multiply + bias while applying scaling to input and +// output matrices Scales are only applicable when matrices are of Float8 type +// and assumed to be equal to 1.0 by default. If output matrix type is 16 or +// 32-bit type, scale_result is not applied. Known limitations: +// - Only works if mat1 is row-major and mat2 is column-major +// - Only works if matrices sizes are divisible by 32 +// - If 1-dimensional tensors are used then scale_a should be size = +// mat1.size(0) +// and scale_b should have size = to mat2.size(1) +// Arguments: +// - `mat_a`: the first operand of the matrix multiply, can be type +// `torch.float8_e4m3fn` or `torch.float8_e5m2` +// - `mat_b`: the second operand of the matrix multiply, can be type +// `torch.float8_e4m3fn` or `torch.float8_e5m2` +// - `scale_a`: a tensor with the inverse scale of `mat1`, whose +// shape/strides/dtype depend on the scaling scheme +// - `scale_recipe_a`: An integer corresponding to an enum describing the +// scaling scheme used for `scale_a` +// - `swizzle_a`: An integer corresponding to a `SwizzleType` enum describing +// the swizzling scheme for `scale_a`. +// Not supported for XPU for now. +// - `scale_b`: a tensor with the inverse scale of `mat2`, whose +// shape/strides/dtype depend on the scaling scheme +// - `scale_recipe_b`: An integer corresponding to an enum describing the +// scaling scheme used for `scale_b` +// - `swizzle_b`: An integer corresponding to a `SwizzleType` enum describing +// the swizzling scheme for `scale_b`. +// Not supported for XPU for now. +// - `bias`: the bias, can be type `torch.float16` or `torch.bfloat16` +// - `out_dtype`: the output dtype, can either be a float8 or a higher +// precision floating point type +// - `contraction_dim`: describe which dimensions are `K` in the matmul. +// Not supported for XPU. Should always be empty. +// - `use_fast_accum`: Not supported for XPU, should always be false. 
+// - `out`: a reference to the output tensor +Tensor& _scaled_mm_xpu_v2_out( + const Tensor& mat_a, + const Tensor& mat_b, + ArrayRef scale_a, + IntArrayRef scale_recipe_a, + IntArrayRef swizzle_a, + ArrayRef scale_b, + IntArrayRef scale_recipe_b, + IntArrayRef swizzle_b, + const std::optional& bias, + const std::optional out_dtype, + IntArrayRef contraction_dim, + bool use_fast_accum, + Tensor& out) { + TORCH_CHECK_VALUE(mat_a.dim() == 2, "mat_a must be a matrix"); + TORCH_CHECK_VALUE(mat_b.dim() == 2, "mat_b must be a matrix"); + + // If any of M, K, N is 0 - return early (the tensorwise/rowwise float8 gemm + // kernels do not support this case). + if (mat_a.size(0) == 0 || mat_a.size(1) == 0 || mat_b.size(1) == 0) { + // `out` was created with `at::empty`. In the case where we are multiplying + // MxK by KxN and K is the zero dim, we need to initialize here to properly + // return a tensor of zeros. + at::native::resize_output(out, {mat_a.size(0), mat_b.size(1)}); + if (mat_a.size(1) == 0) { + out.zero_(); + } + + return out; + } + + // Note: The `contraction_dim` is not actually used for now. We will need to + // align this code when upstreamed CUDA code is done. Currently, only keeps + // the code here for check. + + // Check if the input matrix sizes can be multiplied + // - if optional contraction dims are provided, use those + // -- mostly for < 1B formats (i.e. nvfp4x2) where cheap .t() is not + // available. + if (contraction_dim.size() > 0) { + TORCH_CHECK_VALUE( + contraction_dim.size() == 2, + "contraction_dim must have exactly 2 elements"); + auto mat_a_dim = contraction_dim[0]; + auto mat_b_dim = contraction_dim[1]; + TORCH_CHECK_VALUE( + mat_a.size(mat_a_dim) == mat_b.size(mat_b_dim), + "mat_a and mat_b shapes cannot be multiplied (", + mat_a.size(0), + "x", + mat_a.size(1), + " and ", + mat_b.size(0), + "x", + mat_b.size(1), + ") ", + "with contraction dims mat_a: ", + mat_a_dim, + ", mat_b: ", + mat_b_dim); + } else { + TORCH_CHECK_VALUE( + mat_a.size(1) == mat_b.size(0), + "mat_a and mat_b shapes cannot be multiplied (", + mat_a.size(0), + "x", + mat_a.size(1), + " and ", + mat_b.size(0), + "x", + mat_b.size(1), + ")"); + } + + TORCH_CHECK_VALUE( + !bias || bias->numel() == mat_b.sizes()[1], + "Bias must be size ", + mat_b.sizes()[1], + " but got ", + bias->numel()); + + TORCH_CHECK_VALUE( + !out_dtype || *out_dtype == out.scalar_type(), + "out_dtype must match output matrix type"); + + if (bias) { + TORCH_CHECK_VALUE( + bias->scalar_type() == kFloat || + bias->scalar_type() == c10::ScalarType::BFloat16 || + bias->scalar_type() == c10::ScalarType::Half, + "Bias must be Float32 or BFloat16 or Half, but got ", + bias->scalar_type()); + } + { + auto bias_ = bias.value_or(Tensor()); + // NOLINTNEXTLINE(*c-array*) + TensorArg targs[]{ + {out, "out", 0}, + {mat_a, "mat_a", 1}, + {mat_b, "mat_b", 2}, + {bias_, "bias", 3}, + {scale_a[0], "scale_a", 4}, + {scale_b[0], "scale_b", 5}}; + checkAllSameGPU(__func__, targs); + } + // Align with CUDA's default out to be bf16 + auto out_dtype_ = out_dtype.value_or(c10::ScalarType::BFloat16); + + // Conversion of implicitly-defined enums to explicit + auto scale_recipe_a_enum = convert_int_to_enum(scale_recipe_a); + auto swizzle_a_enum = convert_int_to_enum(swizzle_a); + auto scale_recipe_b_enum = convert_int_to_enum(scale_recipe_b); + auto swizzle_b_enum = convert_int_to_enum(swizzle_b); + + // XPU does not support swizzle for now. So directly return false. 
+ TORCH_CHECK_VALUE( + swizzle_a_enum[0] == at::blas::SwizzleType::NO_SWIZZLE && + swizzle_b_enum[0] == at::blas::SwizzleType::NO_SWIZZLE, + "XPU does not support swizzle yet."); + + // at this point we can start working out what we want to be doing + // Try to do as few steps as possible. + // NOTE: support is deliberately sparse, can explicitly enumerate all + // combinations allowed. Do this via a list of defined (name, acceptance, + // concrete_impl) tuples. + bool found_impl = false; + ScaledGemmImplementation gemm_impl = ScaledGemmImplementation::NONE; + + for (const auto& fn_entry : scale_kernel_dispatch) { + const auto [name, accept_fn, scaled_gemm_impl] = fn_entry; + bool ok = accept_fn( + mat_a.scalar_type(), + scale_recipe_a_enum, + scale_a, + mat_b.scalar_type(), + scale_recipe_b_enum, + scale_b); + if (ok) { + gemm_impl = scaled_gemm_impl; + found_impl = true; + break; + } + } + TORCH_CHECK_VALUE( + found_impl, + "Invalid scaling configuration.\n" + "- For TensorWise scaling, a and b should be float8, scales should be float and singletons.\n" + "- For RowWise scaling, a and b should be float8, scales should be float, scale_a should be (", + mat_a.size(0), + ", 1) and scale_b should be (1, ", + mat_b.size(1), + "), and both should be contiguous.\n" + "Got mat_a.dtype()=", + mat_a.scalar_type(), + ", scale_a[0].dtype()=", + scale_a[0].scalar_type(), + ", scale_a[0].size()=", + scale_a[0].sizes(), + ", scale_a[0].stride()=", + scale_a[0].strides(), + ", ", + "mat_b.dtype()=", + mat_b.scalar_type(), + ", scale_b[0].dtype()=", + scale_b[0].scalar_type(), + ", scale_b[0].size()=", + scale_b[0].sizes(), + " and scale_b[0].stride()=", + scale_b[0].strides()); + + at::native::resize_output(out, {mat_a.size(0), mat_b.size(1)}); + + auto bias_ = bias.value_or(Tensor()); + + // dispatch to appropriate lower-level calls for error checking & execution + if (gemm_impl == ScaledGemmImplementation::TENSORWISE_TENSORWISE) { + return _scaled_tensorwise_tensorwise( + mat_a, + mat_b, + scale_a[0], + scale_b[0], + bias, + out_dtype_, + use_fast_accum, + out); + } else if (gemm_impl == ScaledGemmImplementation::ROWWISE_ROWWISE) { + return _scaled_rowwise_rowwise( + mat_a, + mat_b, + scale_a[0], + scale_b[0], + bias, + out_dtype_, + use_fast_accum, + out); + } else { + TORCH_CHECK_VALUE( + false, "Invalid state - found an implementation, but not really"); + } +} + +Tensor _scaled_mm_xpu_v2( + const Tensor& mat_a, + const Tensor& mat_b, + ArrayRef scale_a, + IntArrayRef scale_recipe_a, + IntArrayRef swizzle_a, + ArrayRef scale_b, + IntArrayRef scale_recipe_b, + IntArrayRef swizzle_b, + const std::optional& bias, + const std::optional out_dtype, + IntArrayRef contraction_dim, + bool use_fast_accum) { + const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type()); + Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_)); + + return _scaled_mm_xpu_v2_out( + mat_a, + mat_b, + scale_a, + scale_recipe_a, + swizzle_a, + scale_b, + scale_recipe_b, + swizzle_b, + bias, + out_dtype, + contraction_dim, + use_fast_accum, + out); +} + +} // namespace at::native diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/QConv.cpp b/aten/src/ATen/native/mkldnn/xpu/detail/QConv.cpp index 282f42f37a364..4d6cb1b81fac3 100644 --- a/aten/src/ATen/native/mkldnn/xpu/detail/QConv.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/detail/QConv.cpp @@ -133,7 +133,7 @@ at::Tensor quantized_convolution( // supported in conv. mask_weight = weight_zero_points.numel() > 1 ? 
1 : 0; if (groups > 1 && weight_zero_points.numel() > 1) - mask_weight = (2 ^ 0) | (2 ^ 1); // 2^0 (group) | 2^1 (output channel) + mask_weight = (1 << 0) | (1 << 1); // 2^0 (group) | 2^1 (output channel) dnnl::primitive_attr pattr; bool src_need_zp = (act_zero_point != 0); diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/QMatmul.cpp b/aten/src/ATen/native/mkldnn/xpu/detail/QMatmul.cpp index ede01093ff3e7..f79dfadd65454 100644 --- a/aten/src/ATen/native/mkldnn/xpu/detail/QMatmul.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/detail/QMatmul.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -8,7 +9,6 @@ #include namespace at::native::onednn { - at::Tensor broadcast_bias2D( at::Tensor& dst, at::Tensor& bias, @@ -328,4 +328,236 @@ void quantized_matmul( result.copy_(dst); } +// Describes how to configure oneDNN scales for a given role/ScalingType +struct ScaleSpec { + // specifies the way scale values will be applied to an ARG tensor. + int mask; + // specifies how scales are grouped along dimensions where + // multiple scale factors are used. + dnnl::memory::dims groups; + // specifies data type for scale factors. + dnnl::memory::data_type dtype; + + // Helper to compute expected number of elements for scale tensors + // arg_type: "src" for SRC (groups pattern {1, X}), + // "wei" for WEIGHTS (groups pattern {X, 1}) + int64_t expected_numel( + int64_t outer_dim, + int64_t inner_dim, + const std::string& arg_type) const { + if (groups == dnnl::memory::dims{1, 1}) + return 1; // tensorwise scaling + + TORCH_CHECK( + arg_type == "src" || arg_type == "wei", + "Expected arg_type to be 'src' or 'wei', but got '", + arg_type, + "'"); + + // For rowwise: SRC groups={1, K}, WEI groups={K, 1} + TORCH_INTERNAL_ASSERT( + (groups == dnnl::memory::dims{1, inner_dim} || + groups == dnnl::memory::dims{inner_dim, 1}), + "The groups must be either {1, inner_dim} or {inner_dim, 1}. But got ", + groups, + "."); + return outer_dim; + } + + // Normalize an incoming scale tensor to contiguous storage and appropriate + // dtype/view + at::Tensor normalize(const at::Tensor& scale) const { + TORCH_INTERNAL_ASSERT( + dtype == dnnl::memory::data_type::f32, + "tensor scale currently must be f32, but got scale dtype: ", + scale.scalar_type()); + return scale.to(at::kFloat).contiguous(); + } +}; + +// This function defines how to set scales mask and groups according to: +// https://github.com/uxlfoundation/oneDNN/blob/main/tests/benchdnn/doc/knobs_attr.md#--attr-scales +// The returned value will be used in +// `set_scales(arg, mask, groups, data_type)`. +inline ScaleSpec make_scale_spec( + at::blas::ScalingType scaling_type, + int64_t M, + int64_t K, + int64_t N, + const std::string& arg_type) { + TORCH_CHECK( + arg_type == "src" || arg_type == "wei", + "Expected arg_type to be 'src' or 'wei', but got '", + arg_type, + "'"); + TORCH_INTERNAL_ASSERT( + (scaling_type == at::blas::ScalingType::TensorWise || + scaling_type == at::blas::ScalingType::RowWise), + "Currently only support scaling_type for TensorWise or RowWise"); + int64_t dim = K; // Currently only K is used for grouping + bool is_src = (arg_type == "src"); + if (scaling_type == at::blas::ScalingType::TensorWise) { + // Scale tensorwise. The same as `--attr-scales=common`. + // mask=0 : scale whole tensor + // groups={1, 1}: indicates that there is only one group for scaling + return {0, {1, 1}, dnnl::memory::data_type::f32}; + } else { + // (scaling_type == at::blas::ScalingType::RowWise) + // Scale RowWise. The same as `--attr-scales=per_dim_01`. 
+ // mask={(1 << 0) | (1 << 1)}: Scale on both dim0 and dim1 + // SRC: groups={1, K}, WEIGHTS: groups={K, 1} + return { + (1 << 0) | (1 << 1), + is_src ? dnnl::memory::dims{1, dim} : dnnl::memory::dims{dim, 1}, + dnnl::memory::data_type::f32}; + } +} + +sycl::event scaled_matmul( + const Tensor& mat1, + const Tensor& mat2, + Tensor& result, + const Tensor& scale_a, + const Tensor& scale_b, + at::blas::ScalingType scaling_choice_a, + at::blas::ScalingType scaling_choice_b, + const std::optional& bias, + const std::optional& scale_result, + bool use_fast_accum) { + auto& engine = GpuEngineManager::Instance().get_engine(); + auto& stream = GpuStreamManager::Instance().get_stream(); + + // This function will do steps with following steps + // 1. create memory descriptor + // 2. call write_to_dnnl_memory() to actually write memory + // 3. execute + + const int64_t M = mat1.size(0); + const int64_t K = mat1.size(1); + const int64_t N = mat2.size(1); + + // 1.1 Create memory descriptor + dnnl::memory::desc src_md = get_onednn_md(mat1); + dnnl::memory::desc weights_md = get_onednn_md(mat2); + dnnl::memory::desc dst_md = get_onednn_md(result); + + // scale_a and scale_b has already be checked in `is_desired_scaling()` call. + // So we could directly get their memory desc and set later. + dnnl::memory::desc scale_a_md = get_onednn_md(scale_a); + dnnl::memory::desc scale_b_md = get_onednn_md(scale_b); + + dnnl::memory::desc bias_md; + bool with_bias = bias.has_value(); + at::Tensor possible_reshaped_bias = bias.value_or(at::Tensor()); + if (with_bias) { + if (possible_reshaped_bias.dim() == 1) { + possible_reshaped_bias = + possible_reshaped_bias.reshape({1, possible_reshaped_bias.size(0)}); + bias_md = get_onednn_md(possible_reshaped_bias); + } else { + bias_md = get_onednn_md(possible_reshaped_bias); + } + } + + // 1.2 Create primitive descriptor and set scales mask + const ScaleSpec src_spec = make_scale_spec(scaling_choice_a, M, K, N, "src"); + const ScaleSpec wei_spec = make_scale_spec(scaling_choice_b, M, K, N, "wei"); + + dnnl::primitive_attr op_attr = dnnl::primitive_attr(); + +#if ONEDNN_SUPPORT_DETERMINISTIC + if (at::globalContext().deterministicAlgorithms() || + at::globalContext().deterministicMkldnn()) + op_attr.set_deterministic(true); +#endif + + std::vector default_groups; + op_attr.set_scales( + DNNL_ARG_SRC, src_spec.mask, src_spec.groups, src_spec.dtype); + op_attr.set_scales( + DNNL_ARG_WEIGHTS, wei_spec.mask, wei_spec.groups, wei_spec.dtype); + // scale_result tensor currently only supports scalar(TensorWise Scaling). + bool with_dst_scale = scale_result && scale_result->defined(); + if (with_dst_scale) { + op_attr.set_scales(DNNL_ARG_DST, 0, {1}, dnnl::memory::data_type::f32); + } + + op_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + + // 1.3 Create the matmul primitive descriptor + dnnl::matmul::primitive_desc matmul_pd = with_bias + ? dnnl::matmul::primitive_desc( + engine, src_md, weights_md, bias_md, dst_md, op_attr) + : dnnl::matmul::primitive_desc( + engine, src_md, weights_md, dst_md, op_attr); + + // 1.4 (Possible) Additional Checks + // TODO: In case there are memory desc does not align with the actual tensor, + // we might need to reorder weights similar to CPU's reorder_if_differ_in() + // call. For example, weights not the same as matmul_pd.weights_desc(), + + // 2. 
Prepare memory + + // Create memory + auto src_usr_m = make_onednn_memory(src_md, engine, mat1.data_ptr()); + auto weights_usr_m = make_onednn_memory(weights_md, engine, mat2.data_ptr()); + auto dst_usr_m = make_onednn_memory(dst_md, engine, result.data_ptr()); + dnnl::memory b_usr_m; + if (with_bias) { + b_usr_m = + make_onednn_memory(bias_md, engine, possible_reshaped_bias.data_ptr()); + } + + // Prepare runtime scale memories (flat 1-D views) using the specs + auto make_scale_mem_from_spec = [&](const ScaleSpec& spec, + int64_t expected_numel, + const at::Tensor& scale_tensor) { + at::Tensor prepared = spec.normalize(scale_tensor); + TORCH_CHECK( + prepared.numel() == expected_numel, + "Scale buffer length mismatch. Expected ", + expected_numel, + ", got ", + prepared.numel()); + dnnl::memory::desc scale_md( + {prepared.numel()}, spec.dtype, dnnl::memory::format_tag::x); + return make_onednn_memory(scale_md, engine, prepared.data_ptr()); + }; + + auto scratchpad = + make_onednn_memory(matmul_pd.scratchpad_desc(), engine, nullptr); + + // 3. Setup Args for exec + std::unordered_map args; + args.insert({DNNL_ARG_SRC, src_usr_m}); + args.insert({DNNL_ARG_WEIGHTS, weights_usr_m}); + args.insert({DNNL_ARG_DST, dst_usr_m}); + args.insert({DNNL_ARG_SCRATCHPAD, scratchpad}); + if (with_bias) { + args.insert({DNNL_ARG_BIAS, b_usr_m}); + } + + // Attach runtime scales using specs + auto src_sc_mem = make_scale_mem_from_spec( + src_spec, src_spec.expected_numel(M, K, "src"), scale_a); + auto wei_sc_mem = make_scale_mem_from_spec( + wei_spec, wei_spec.expected_numel(N, K, "wei"), scale_b); + args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, src_sc_mem}); + args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, wei_sc_mem}); + if (with_dst_scale) { + // Bind single f32 scalar as DST scale + at::Tensor dst_scale_f32 = scale_result->to(at::kFloat).contiguous(); + dnnl::memory::desc dst_sc_md( + {1}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::x); + auto dst_sc_mem = + make_onednn_memory(dst_sc_md, engine, dst_scale_f32.data_ptr()); + args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_sc_mem}); + } + + dnnl::matmul matmul_p = dnnl::matmul(matmul_pd); + sycl::event matmul_fwd_event = + dnnl::sycl_interop::execute(matmul_p, stream, args); + return matmul_fwd_event; +} + } // namespace at::native::onednn diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/Utils.cpp b/aten/src/ATen/native/mkldnn/xpu/detail/Utils.cpp index 15f24e9cbb3a4..a8a6b870ff6b6 100644 --- a/aten/src/ATen/native/mkldnn/xpu/detail/Utils.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/detail/Utils.cpp @@ -78,6 +78,10 @@ dnnl::memory::data_type get_onednn_dtype( return dnnl::memory::data_type::f32; case at::ScalarType::BFloat16: return dnnl::memory::data_type::bf16; + case at::ScalarType::Float8_e4m3fn: + return dnnl::memory::data_type::f8_e4m3; + case at::ScalarType::Float8_e5m2: + return dnnl::memory::data_type::f8_e5m2; default: if (!allow_undef) { TORCH_CHECK( diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h index 6b2bf01e6d73d..bbe880b672b9d 100644 --- a/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h +++ b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -202,4 +203,16 @@ void sdpa_backward( Tensor& grad_query, Tensor& grad_key, Tensor& grad_value); + +sycl::event scaled_matmul( + const Tensor& mat1, + const Tensor& mat2, + Tensor& result, + const Tensor& scale_a, + const Tensor& 
scale_b, + at::blas::ScalingType scaling_choice_a, + at::blas::ScalingType scaling_choice_b, + const std::optional& bias, + const std::optional& scale_result, + bool use_fast_accum); } // namespace at::native::onednn diff --git a/aten/src/ATen/native/mps/OperationUtils.h b/aten/src/ATen/native/mps/OperationUtils.h index 03b3076402d0a..cb488a3f5f117 100644 --- a/aten/src/ATen/native/mps/OperationUtils.h +++ b/aten/src/ATen/native/mps/OperationUtils.h @@ -40,8 +40,6 @@ using namespace at::mps; namespace at::native::mps { -void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)()); - struct MPSScalar { id getMTLBuffer() const { return __builtin_bit_cast(id, buffer.get()); @@ -84,6 +82,7 @@ NSArray* getTensorAxes(const TensorBase& t); NSArray* getTensorAxes(const IntArrayRef& sizes, at::OptionalIntArrayRef dim); std::string getMPSShapeString(MPSShape* shape); std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype = true, bool exclude_shape = false); +std::string to_hex_key(float); std::string getArrayRefString(const IntArrayRef s); // use has_storage() on the returned tensor to determine if src actually is a view Tensor gatherViewTensor(const Tensor& src, Tensor& dst); diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm index 96cd5d41959c3..196d514a2c580 100644 --- a/aten/src/ATen/native/mps/OperationUtils.mm +++ b/aten/src/ATen/native/mps/OperationUtils.mm @@ -53,21 +53,6 @@ - (MPSGraphTensor*)maximumWithNaNPropagationAndIntFallbackWithPrimaryTensor:(MPS @end namespace at::native::mps { - -void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)()) { - __block std::optional block_exception; - dispatch_sync(queue, ^() { - try { - block(); - } catch (...) { - block_exception = std::current_exception(); - } - }); - if (block_exception) { - std::rethrow_exception(*block_exception); - } -} - /** * Computes distance from lowest to highest element offset in given tensor. 
*/ @@ -316,6 +301,10 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) { return fmt::to_string(fmt::join(s, ",")); } +std::string to_hex_key(float f) { + return fmt::format("{:a}", f); +} + std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype, bool exclude_shape) { fmt::basic_memory_buffer buffer; auto buf_iterator = std::back_inserter(buffer); diff --git a/aten/src/ATen/native/mps/kernels/Indexing.metal b/aten/src/ATen/native/mps/kernels/Indexing.metal index b41e64d70ced5..ebe078d01781e 100644 --- a/aten/src/ATen/native/mps/kernels/Indexing.metal +++ b/aten/src/ATen/native/mps/kernels/Indexing.metal @@ -1,4 +1,5 @@ #include +#include #include #include @@ -31,10 +32,24 @@ OffsetT index_apply_indices( constant IndexAB* indices, constant int64_t* sizes, constant int64_t* strides, - uint num_indices) { + uint num_indices, + thread bool& error, + device ErrorMessages* error_buf) { OffsetT rc = offs.x; for (uint i = 0; i < num_indices; i++) { auto idx = indices[i].indexArray[offs.y]; + if (idx < -sizes[i] || idx >= sizes[i]) { + TORCH_REPORT_ERROR( + error_buf, + "index ", + idx, + " is out of bounds for dimension ", + i, + " with size ", + sizes[i]); + error = true; + break; + } if (idx < 0) { idx += sizes[i]; } @@ -55,6 +70,7 @@ kernel void index_select( constant int64_t* index_sizes, constant int64_t* index_strides, constant uint4& ndim_nindices_numel, + device ErrorMessages* error_buffer, uint thread_index [[thread_position_in_grid]]) { const auto ndim = ndim_nindices_numel.x; const auto num_indices = ndim_nindices_numel.y; @@ -65,8 +81,19 @@ kernel void index_select( indices_strides, ndim, thread_index); + bool error = false; auto input_offs = index_apply_indices( - offs.yz, indices, index_sizes, index_strides, num_indices); + offs.yz, + indices, + index_sizes, + index_strides, + num_indices, + error, + error_buffer); + if (error) { + output[offs.x / sizeof(T)] = 0; + return; + } output[offs.x / sizeof(T)] = input[input_offs / sizeof(T)]; } @@ -82,7 +109,9 @@ inline void index_put_impl( constant int64_t* index_sizes, constant int64_t* index_strides, constant uint4& ndim_nindices_numel, + device ErrorMessages* error_buffer, uint thread_index) { + bool error = false; const auto ndim = ndim_nindices_numel.x; const auto num_indices = ndim_nindices_numel.y; const auto offs = index_get_offsets( @@ -93,7 +122,16 @@ inline void index_put_impl( ndim, thread_index); auto output_offs = index_apply_indices( - offs.xz, indices, index_sizes, index_strides, num_indices); + offs.xz, + indices, + index_sizes, + index_strides, + num_indices, + error, + error_buffer); + if (error) { + return; + } output[output_offs / sizeof(T)] = input[offs.y / sizeof(T)]; } @@ -109,6 +147,7 @@ kernel void index_put( constant int64_t* index_sizes, constant int64_t* index_strides, constant uint4& ndim_nindices_numel, + device ErrorMessages* error_buffer, uint thread_index [[thread_position_in_grid]]) { index_put_impl( output, @@ -121,6 +160,7 @@ kernel void index_put( index_sizes, index_strides, ndim_nindices_numel, + error_buffer, thread_index); } @@ -136,6 +176,7 @@ kernel void index_put_serial( constant int64_t* index_sizes, constant int64_t* index_strides, constant uint4& ndim_nindices_numel, + device ErrorMessages* error_buffer, uint thread_index [[thread_position_in_grid]]) { (void)thread_index; // Suppress unused vairable varning for (uint idx = 0; idx < ndim_nindices_numel.z; ++idx) { @@ -150,6 +191,7 @@ kernel void index_put_serial( index_sizes, index_strides, ndim_nindices_numel, + 
error_buffer, idx); } } @@ -166,6 +208,7 @@ kernel void index_put_accumulate( constant int64_t* index_sizes, constant int64_t* index_strides, constant uint4& ndim_nindices_numel, + device ErrorMessages* error_buffer, uint thread_index [[thread_position_in_grid]]) { const auto ndim = ndim_nindices_numel.x; const auto num_indices = ndim_nindices_numel.y; @@ -176,8 +219,18 @@ kernel void index_put_accumulate( indices_strides, ndim, thread_index); + bool error = false; auto output_offs = index_apply_indices( - offs.xz, indices, index_sizes, index_strides, num_indices); + offs.xz, + indices, + index_sizes, + index_strides, + num_indices, + error, + error_buffer); + if (error) { + return; + } AtomicType::atomic_add( reinterpret_cast*>(output), output_offs / sizeof(T), @@ -197,6 +250,7 @@ kernel void index_put_accumulate( constant int64_t* index_sizes, \ constant int64_t* index_strides, \ constant uint4& ndim_nindices_numel, \ + device ErrorMessages* error_buffer, \ uint thread_index [[thread_position_in_grid]]) #define REGISTER_INDEX_OP_ALL_DTYPES(OP_NAME) \ diff --git a/aten/src/ATen/native/mps/kernels/LinearAlgebra.metal b/aten/src/ATen/native/mps/kernels/LinearAlgebra.metal index c356dbf9ecb38..ecb2ddefd1fc1 100644 --- a/aten/src/ATen/native/mps/kernels/LinearAlgebra.metal +++ b/aten/src/ATen/native/mps/kernels/LinearAlgebra.metal @@ -40,7 +40,7 @@ inline c10::metal::opmath_t matmul_inner( threadgroup_barrier(mem_flags::mem_threadgroup); for (uint k = 0; k < TILE_DIM; k++) { - sum += A_tile[tid.y][k] * B_tile[k][tid.x]; + sum += c10::metal::mul(A_tile[tid.y][k], B_tile[k][tid.x]); } threadgroup_barrier(mem_flags::mem_threadgroup); @@ -96,7 +96,9 @@ kernel void addmm( auto bias = biasData[thread_id.y * strides[3].x + thread_id.x * strides[3].y]; outputData[thread_id.y * strides[2].x + thread_id.x * strides[2].y] = - static_cast(alpha_beta[0] * sum + alpha_beta[1] * bias); + static_cast( + c10::metal::mul(alpha_beta[0], sum) + + c10::metal::mul(alpha_beta[1], bias)); } } @@ -832,6 +834,10 @@ INSTANTIATE_MM_OPS(float); INSTANTIATE_MM_OPS(half); INSTANTIATE_MM_OPS(bfloat); +// Complex MM +INSTANTIATE_MM_OPS(float2); +INSTANTIATE_MM_OPS(half2); + // Integral MM INSTANTIATE_MM_OPS(long); INSTANTIATE_MM_OPS(int); diff --git a/aten/src/ATen/native/mps/operations/Blas.mm b/aten/src/ATen/native/mps/operations/Blas.mm index 16d744cedb8ef..5ebf5f604bfc1 100644 --- a/aten/src/ATen/native/mps/operations/Blas.mm +++ b/aten/src/ATen/native/mps/operations/Blas.mm @@ -141,6 +141,9 @@ Tensor dot_mps(const Tensor& self, const Tensor& other) { }; MPSStream* stream = at::mps::getCurrentMPSStream(); + if (result.numel() == 0) { + return result; + } Tensor matMulVec = at::mm(mat, vec.unsqueeze(1)).squeeze(1); @autoreleasepool { diff --git a/aten/src/ATen/native/mps/operations/EmbeddingBag.mm b/aten/src/ATen/native/mps/operations/EmbeddingBag.mm index e6690b2531f0d..d7916ccdf875d 100644 --- a/aten/src/ATen/native/mps/operations/EmbeddingBag.mm +++ b/aten/src/ATen/native/mps/operations/EmbeddingBag.mm @@ -220,7 +220,7 @@ Tensor _embedding_bag_dense_backward_mps(const Tensor& output_grad, auto num_threads = (params.mode == EmbeddingBagMode::MAX) ? 
output_grad.numel() : num_indices * params.feature_size; MPSStream* stream = getCurrentMPSStream(); - mps::dispatch_sync_with_rethrow(stream->queue(), ^() { + dispatch_sync_with_rethrow(stream->queue(), ^() { @autoreleasepool { id computeEncoder = stream->commandEncoder(); auto pipeline_state = lib.getPipelineStateForFunc(fmt::format("embedding_bag_backward_{}_{}", @@ -273,7 +273,7 @@ Tensor _embedding_bag_per_sample_weights_backward_mps(const Tensor& output_grad, auto num_threads = num_indices * feature_size; MPSStream* stream = getCurrentMPSStream(); - mps::dispatch_sync_with_rethrow(stream->queue(), ^() { + dispatch_sync_with_rethrow(stream->queue(), ^() { @autoreleasepool { id computeEncoder = stream->commandEncoder(); auto pipeline_state = lib.getPipelineStateForFunc(fmt::format("embedding_bag_per_sample_weights_backward_{}_{}", diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm index 0b0a84c45a52c..2a21f3f8aadca 100644 --- a/aten/src/ATen/native/mps/operations/Indexing.mm +++ b/aten/src/ATen/native/mps/operations/Indexing.mm @@ -179,7 +179,8 @@ static void dispatch_index_kernel(TensorIteratorBase& iter, iter.strides(2), index_size, index_stride, - ndim_nindiees); + ndim_nindiees, + mpsStream->getErrorBuffer()); mtl_dispatch1DJob(computeEncoder, indexSelectPSO, serial ? 1 : iter.numel()); }); } @@ -299,7 +300,7 @@ static Tensor nonzero_fallback(const Tensor& self) { MPSStream* stream = getCurrentMPSStream(); using CachedGraph = MPSUnaryCachedGraph; - dispatch_sync(stream->queue(), ^() { + dispatch_sync_with_rethrow(stream->queue(), ^() { stream->synchronize(SyncType::COMMIT_AND_WAIT); }); int64_t total_nonzero = at::count_nonzero(self).item(); @@ -384,7 +385,7 @@ static Tensor nonzero_fallback(const Tensor& self) { MPSStream* stream = getCurrentMPSStream(); using CachedGraph = MPSUnaryCachedGraph; - dispatch_sync(stream->queue(), ^() { + dispatch_sync_with_rethrow(stream->queue(), ^() { stream->synchronize(SyncType::COMMIT_AND_WAIT); }); int64_t total_nonzero = at::count_nonzero(self).item(); diff --git a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm index aed417ca9ca92..ca19d121bb718 100644 --- a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm +++ b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm @@ -121,7 +121,7 @@ const Scalar& alpha, const Scalar& beta, const Tensor& bias) { - if (beta.toDouble() == 0 && alpha.toDouble() == 1) { + if (beta.isFloatingPoint() && alpha.isFloatingPoint() && beta.toDouble() == 0 && alpha.toDouble() == 1) { return do_metal_mm(self, other, output); } auto stream = getCurrentMPSStream(); @@ -147,13 +147,15 @@ std::array i64; std::array i32; std::array f32; - } alpha_beta; + std::array, 2> c64; + } alpha_beta{}; if (output.scalar_type() == kLong) { alpha_beta.i64 = {alpha.toLong(), beta.toLong()}; } else if (c10::isIntegralType(output.scalar_type(), true)) { alpha_beta.i32 = {alpha.toInt(), beta.toInt()}; + } else if (c10::isComplexType(output.scalar_type())) { + alpha_beta.c64 = {alpha.toComplexFloat(), beta.toComplexFloat()}; } else { - TORCH_INTERNAL_ASSERT(c10::isFloatingType(output.scalar_type())); alpha_beta.f32 = {alpha.toFloat(), beta.toFloat()}; } constexpr uint32_t TILE_DIM = 16; // fastest performance from tests on multiple macs @@ -190,10 +192,16 @@ bool use_metal_mm(const Tensor& self, const Tensor& other, const Tensor& output) { static bool always_use_metal = c10::utils::has_env("PYTORCH_MPS_PREFER_METAL"); 
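+  // Rough decision sketch for this helper: prefer the hand-written Metal matmul
+  // kernels when PYTORCH_MPS_PREFER_METAL is set or the inputs are integral;
+  // for complex inputs, also prefer them once the inner (K) dimension exceeds
+  // 2048, since MPSGraph's multiplicationWithPrimaryTensor: is known to return
+  // incorrect results past that size (see the issue referenced below); otherwise
+  // only fall back to Metal on pre-14.4 macOS when sizes or strides exceed 32768.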
constexpr auto max_stride_size = 32768; + constexpr auto max_complex_inner_size = 2048; static bool is_macos_14_4_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_4_PLUS); if (always_use_metal || c10::isIntegralType(self.scalar_type(), true)) { return true; } + // multiplicationWithPrimaryTensor: returns incorrect results if inner size exceeds 2048 + // See https://github.com/pytorch/pytorch/issues/167727#issuecomment-3529308548 + if (c10::isComplexType(self.scalar_type()) && self.size(1) > max_complex_inner_size) { + return true; + } return !is_macos_14_4_or_newer && (self.stride(0) > max_stride_size || self.stride(1) > max_stride_size || self.size(0) > max_stride_size || self.size(1) > max_stride_size || other.stride(0) > max_stride_size || other.stride(1) > max_stride_size || diff --git a/aten/src/ATen/native/mps/operations/LossOps.mm b/aten/src/ATen/native/mps/operations/LossOps.mm index c995b8fc237f3..f0bbcdabfa5cd 100644 --- a/aten/src/ATen/native/mps/operations/LossOps.mm +++ b/aten/src/ATen/native/mps/operations/LossOps.mm @@ -212,17 +212,12 @@ loss.resize_((reduction == Reduction::None || grad_output.defined()) ? target.sizes() : IntArrayRef({})); TORCH_CHECK(loss.is_mps()); - Tensor loss_squeezed = loss.squeeze(); - Tensor input_squeezed = input.squeeze(); - Tensor target_squeezed = target.squeeze(); - @autoreleasepool { - std::string key = - op_name + reductionToString(reduction) + getTensorsStringKey({input_squeezed, target_squeezed, weight}); + std::string key = op_name + reductionToString(reduction) + getTensorsStringKey({input, target, weight}); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { - newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_squeezed); - newCachedGraph->targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target_squeezed); + newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input); + newCachedGraph->targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target); MPSGraphTensor* bceLossUnweighted = nil; // if grad_output is defined, then it's a backward pass @@ -252,12 +247,12 @@ newCachedGraph->gradInputTensor = bceLoss; } } else { - newCachedGraph->lossTensor = reduceTensor(bceLoss, reduction, mpsGraph, input_squeezed.sizes().size()); + newCachedGraph->lossTensor = reduceTensor(bceLoss, reduction, mpsGraph, input.sizes().size()); } }); - Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input_squeezed); - Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor, target_squeezed); - Placeholder lossPlaceholder = Placeholder(cachedGraph->lossTensor, loss_squeezed); + Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input); + Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor, target); + Placeholder lossPlaceholder = Placeholder(cachedGraph->lossTensor, loss); NSMutableDictionary* feeds = [[NSMutableDictionary new] autorelease]; diff --git a/aten/src/ATen/native/mps/operations/Normalization.mm b/aten/src/ATen/native/mps/operations/Normalization.mm index 0c95fec667e80..7441692b6c291 100644 --- a/aten/src/ATen/native/mps/operations/Normalization.mm +++ b/aten/src/ATen/native/mps/operations/Normalization.mm @@ -923,7 +923,7 @@ Check if running mean exists (maybe do this check before making graph) MPSStream* stream = getCurrentMPSStream(); TORCH_CHECK_NOT_IMPLEMENTED(input.scalar_type() != kLong, "Not implemented for long on MPS"); @autoreleasepool { - mps::dispatch_sync_with_rethrow(stream->queue(), ^() { + 
dispatch_sync_with_rethrow(stream->queue(), ^() { // which kernel variant to use based on the normalized axis N size const int N_READS = 4; auto metalType = mps::scalarToMetalTypeString(input); diff --git a/aten/src/ATen/native/mps/operations/Repeat.mm b/aten/src/ATen/native/mps/operations/Repeat.mm index 40afa15b4f700..f350b0137b05e 100644 --- a/aten/src/ATen/native/mps/operations/Repeat.mm +++ b/aten/src/ATen/native/mps/operations/Repeat.mm @@ -91,26 +91,31 @@ Tensor repeat_mps(const Tensor& self, IntArrayRef repeats) { #include #endif -template -void computeRepeatIndices(const index_t* repeat_ptr, - const int64_t* cumsum_ptr, - index_t* result_ptr, - int64_t size, - int64_t result_size) { - id repeatBuffer = reinterpret_cast>(repeat_ptr); - id cumsumBuffer = reinterpret_cast>(cumsum_ptr); - id resultBuffer = reinterpret_cast>(result_ptr); - TORCH_CHECK(repeatBuffer && cumsumBuffer && resultBuffer); - +Tensor repeat_interleave_mps(const Tensor& repeat, std::optional output_size) { + TORCH_CHECK(repeat.dim() == 1, "repeat_interleave only accept 1D vector as repeat"); std::string scalar_type; - if constexpr (std::is_same_v) { + if (repeat.scalar_type() == kInt) { scalar_type = "int32_t"; - } else if constexpr (std::is_same_v) { + } else if (repeat.scalar_type() == kLong) { scalar_type = "int64_t"; } else { - TORCH_CHECK(false, "repeat_interleave: unsupported indexing data type"); + TORCH_CHECK(false, "repeats has to be Long or Int tensor"); + } + if (repeat.size(0) == 0) { + return at::empty_like(repeat, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + Tensor repeat_ = repeat.contiguous(); + Tensor cumsum = repeat.cumsum(0); + int64_t total = 0; + if (output_size.has_value()) { + total = output_size.value(); + } else { + total = cumsum[-1].item(); + TORCH_CHECK((repeat >= 0).all().item(), "repeats can not be negative"); } + auto result = at::empty({total}, repeat.options()); + MPSStream* mpsStream = getCurrentMPSStream(); dispatch_sync(mpsStream->queue(), ^() { @autoreleasepool { @@ -121,20 +126,13 @@ void computeRepeatIndices(const index_t* repeat_ptr, getMPSProfiler().beginProfileKernel(pipelineState, "repeat_interleave:" + scalar_type, false); [computeEncoder setComputePipelineState:pipelineState]; - mps::mtl_setArgs(computeEncoder, repeatBuffer, cumsumBuffer, resultBuffer, size); - mps::mtl_dispatch1DJob(computeEncoder, pipelineState, size); + mps::mtl_setArgs(computeEncoder, repeat_, cumsum, result, repeat.size(0)); + mps::mtl_dispatch1DJob(computeEncoder, pipelineState, repeat.size(0)); getMPSProfiler().endProfileKernel(pipelineState); } }); -} - -Tensor repeat_interleave_mps(const Tensor& repeat, std::optional output_size) { - Tensor output; - AT_DISPATCH_INDEX_TYPES(repeat.scalar_type(), "repeat_interleave_mps", [&]() { - output = repeat_interleave_common>(repeat, output_size); - }); - return output; + return result; } } // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/TensorCompare.mm b/aten/src/ATen/native/mps/operations/TensorCompare.mm index 7b637d896f850..ed659bddd65cc 100644 --- a/aten/src/ATen/native/mps/operations/TensorCompare.mm +++ b/aten/src/ATen/native/mps/operations/TensorCompare.mm @@ -5,6 +5,7 @@ #include #include #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -89,13 +90,21 @@ static void check_min_max_dims(const OptionalTensorRef clamp_opt, const Tensor& auto clamp_shape = clamp_opt->sizes(); auto input_shape = input_t.sizes(); - TORCH_CHECK(num_clamp_dims <= num_input_dims, - op_name + ": clamp tensor number of dims must not be 
greater than that of input tensor") + if (num_clamp_dims > num_input_dims) { + auto leading_dims = num_clamp_dims - num_input_dims; + for (int64_t i = 0; i < leading_dims; ++i) { + TORCH_CHECK(clamp_shape[i] == 1, + op_name + ": clamp tensor leading shape must be 1 to broadcast with input tensor"); + } + } - for (int i = 0; i < num_clamp_dims; i++) + auto clamp_idx = num_clamp_dims - 1; + auto input_idx = num_input_dims - 1; + auto common_dims = std::min(num_clamp_dims, num_input_dims); + for (int64_t i = 0; i < common_dims; ++i) // One of the indices is allowed to be 1; will be handled by broadcast - TORCH_CHECK(clamp_shape[num_clamp_dims - 1 - i] == input_shape[num_input_dims - 1 - i] || - clamp_shape[num_clamp_dims - 1 - i] == 1 || input_shape[num_input_dims - 1 - i] == 1, + TORCH_CHECK(clamp_shape[clamp_idx - i] == input_shape[input_idx - i] || clamp_shape[clamp_idx - i] == 1 || + input_shape[input_idx - i] == 1, op_name + ": clamp tensor trailing shape must match input tensor") } } @@ -136,9 +145,6 @@ static void clamp_tensor_out_mps(const Tensor& input_t, auto result_type = output_t.scalar_type(); - IntArrayRef new_min_shape; - IntArrayRef new_max_shape; - auto num_min_dims = min_opt->dim(); auto num_max_dims = max_opt->dim(); auto num_input_dims = input_t.dim(); @@ -146,24 +152,32 @@ static void clamp_tensor_out_mps(const Tensor& input_t, std::vector new_min_arr(num_input_dims); std::vector new_max_arr(num_input_dims); - if (has_min && num_min_dims < num_input_dims) { - fill_new_shape(num_input_dims, num_min_dims, new_min_arr.data(), min_opt->sizes()); - new_min_shape = IntArrayRef(new_min_arr); - } - - if (has_max && num_max_dims < num_input_dims) { - fill_new_shape(num_input_dims, num_max_dims, new_max_arr.data(), max_opt->sizes()); - new_max_shape = IntArrayRef(new_max_arr); - } - Tensor min_opt_tensor; Tensor max_opt_tensor; + auto reshape_clamp_tensor = [&](const OptionalTensorRef clamp_tensor_ref, + int64_t num_clamp_dims, + std::vector& new_shape_storage) -> Tensor { + IntArrayRef clamp_shape = clamp_tensor_ref->sizes(); + bool requires_view = false; + + if (num_clamp_dims > num_input_dims) { + clamp_shape = clamp_shape.slice(num_clamp_dims - num_input_dims); + requires_view = true; + } else if (num_clamp_dims < num_input_dims) { + fill_new_shape(num_input_dims, num_clamp_dims, new_shape_storage.data(), clamp_shape); + clamp_shape = IntArrayRef(new_shape_storage); + requires_view = true; + } + + return requires_view ? (*clamp_tensor_ref).view(clamp_shape) : *clamp_tensor_ref; + }; + if (has_min) { - min_opt_tensor = (num_min_dims < num_input_dims) ? (*min_opt).view(new_min_shape) : *min_opt; + min_opt_tensor = reshape_clamp_tensor(min_opt, num_min_dims, new_min_arr); } if (has_max) { - max_opt_tensor = (num_max_dims < num_input_dims) ? (*max_opt).view(new_max_shape) : *max_opt; + max_opt_tensor = reshape_clamp_tensor(max_opt, num_max_dims, new_max_arr); } @autoreleasepool { @@ -244,8 +258,8 @@ static void clamp_scalar_out_mps(const Tensor& input_t, @autoreleasepool { // the optional min/max refs could affect how we build the cached graph - std::string key = op_name + (has_min ? ("_min:" + std::to_string(min_scalar)) : "") + - (has_max ? ("_max:" + std::to_string(max_scalar)) : "") + "_scalar:" + getTensorsStringKey({input_t}); + std::string key = op_name + (has_min ? ("_min:" + to_hex_key(min_scalar)) : "") + + (has_max ? 
("_max:" + to_hex_key(max_scalar)) : "") + "_scalar:" + getTensorsStringKey({input_t}); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { if (has_min) newCachedGraph->minTensor = [mpsGraph constantWithScalar:min_scalar diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 4424f51827d45..9a1c7c790afaa 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -192,6 +192,11 @@ CompositeExplicitAutograd: _assert_tensor_metadata Meta: _assert_tensor_metadata_meta_symint +- func: _async_error(str msg) -> () + dispatch: + CompositeExplicitAutograd: _async_error + Meta: _async_error_meta + - func: _print(str s) -> () dispatch: CompositeExplicitAutograd: _print @@ -2803,7 +2808,7 @@ - func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA, MPS: floor_divide_out + CPU, CUDA, MPS, MTIA: floor_divide_out SparseCPU, SparseCUDA, SparseMPS: floor_divide_out_sparse_zerodim - func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor @@ -4220,7 +4225,7 @@ MTIA: mm_out_mtia MPS: mm_out_mps XPU: mm_out_xpu - SparseCPU, SparseCUDA: _sparse_mm_out + SparseCPU, SparseCUDA, SparseMPS: _sparse_mm_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _sparse_csr_mm_out - func: mm.dtype(Tensor self, Tensor mat2, ScalarType out_dtype) -> Tensor @@ -4292,6 +4297,7 @@ dispatch: SparseCPU: sparse_sparse_matmul_cpu SparseCUDA: sparse_sparse_matmul_cuda + SparseMPS: sparse_sparse_matmul_mps autogen: _sparse_sparse_matmul.out - func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices) @@ -4383,7 +4389,7 @@ variants: function, method dispatch: CompositeExplicitAutograd: mv - SparseCPU, SparseCUDA: mv_sparse + SparseCPU, SparseCUDA, SparseMPS: mv_sparse - func: mv.out(Tensor self, Tensor vec, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: @@ -7512,7 +7518,7 @@ - func: _sparse_mask_projection(Tensor self, Tensor mask, bool accumulate_matches=False) -> Tensor variants: method dispatch: - SparseCPU, SparseCUDA: sparse_mask_projection + SparseCPU, SparseCUDA, SparseMPS: sparse_mask_projection autogen: _sparse_mask_projection.out - func: _to_cpu(Tensor[] tensors) -> Tensor[] @@ -9832,7 +9838,7 @@ structured_delegate: erfinv.out variants: method, function dispatch: - SparseCPU, SparseCUDA: erfinv_sparse + SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr tags: pointwise @@ -9841,7 +9847,7 @@ structured_delegate: erfinv.out variants: method dispatch: - SparseCPU, SparseCUDA: erfinv_sparse_ + SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr_ tags: pointwise @@ -9851,7 +9857,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: erfinv_out - SparseCPU, SparseCUDA: erfinv_sparse_out + SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr_out tags: pointwise diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/buckbuild.bzl b/aten/src/ATen/native/quantized/cpu/qnnpack/buckbuild.bzl index 180442b4b09a4..fecce634ec08c 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/buckbuild.bzl +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/buckbuild.bzl @@ -1,7 +1,7 @@ load("//tools/build_defs:fb_xplat_cxx_library.bzl", "fb_xplat_cxx_library") load("//tools/build_defs:fb_xplat_cxx_test.bzl", "fb_xplat_cxx_test") load("//tools/build_defs:glob_defs.bzl", "subdir_glob") -load("//tools/build_defs:platform_defs.bzl", "ANDROID", "APPLE", "APPLETVOS", "CXX", "IOS", "MACOSX") +load("//tools/build_defs:platform_defs.bzl", "ANDROID", "APPLE", "CXX", "IOS", "MACOSX") # Shared by internal and OSS BUCK def define_qnnpack(third_party, labels = []): @@ -21,7 +21,7 @@ def define_qnnpack(third_party, labels = []): ("src", "requantization/*.h"), ]), header_namespace = "", - apple_sdks = (IOS, MACOSX, APPLETVOS), + apple_sdks = (IOS, MACOSX), compiler_flags = [ "-O2", "-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION", @@ -82,7 +82,7 @@ def define_qnnpack(third_party, labels = []): ("src", "requantization/*.h"), ]), header_namespace = "", - apple_sdks = (IOS, MACOSX, APPLETVOS), + apple_sdks = (IOS, MACOSX), compiler_flags = [ "-O3", "-ffast-math", @@ -129,7 +129,7 @@ def define_qnnpack(third_party, labels = []): ("src", "requantization/*.h"), ]), header_namespace = "", - apple_sdks = (IOS, MACOSX, APPLETVOS), + apple_sdks = (IOS, MACOSX), compiler_flags = [ "-O3", "-ffast-math", @@ -184,7 +184,7 @@ def define_qnnpack(third_party, labels = []): ("src", "requantization/*.h"), ]), header_namespace = "", - apple_sdks = (IOS, MACOSX, APPLETVOS), + apple_sdks = (IOS, MACOSX), compiler_flags = [ "-O3", "-ffast-math", @@ -236,7 +236,7 @@ def define_qnnpack(third_party, labels = []): ], ), header_namespace = "", - apple_sdks = (IOS, MACOSX, APPLETVOS), + apple_sdks = (IOS, MACOSX), compiler_flags = [ "-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION", ], @@ -291,7 +291,7 @@ def define_qnnpack(third_party, labels = []): ("src", "qnnpack/*.h"), ("include", "*.h"), ]), - apple_sdks = (IOS, MACOSX, APPLETVOS), + apple_sdks = (IOS, MACOSX), compiler_flags = [ "-O2", "-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION", @@ -398,7 +398,7 @@ def define_qnnpack(third_party, labels = []): ("src", "requantization/*.h"), ]), header_namespace = "", - apple_sdks = (IOS, MACOSX, APPLETVOS), + apple_sdks = 
(IOS, MACOSX), compiler_flags = [ "-O3", "-ffast-math", @@ -465,7 +465,7 @@ def define_qnnpack(third_party, labels = []): ("src", "requantization/*.h"), ]), header_namespace = "", - apple_sdks = (IOS, MACOSX, APPLETVOS), + apple_sdks = (IOS, MACOSX), compiler_flags = [ "-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION", "-Wno-unused-command-line-argument", @@ -525,7 +525,7 @@ def define_qnnpack(third_party, labels = []): ("src", "qnnpack/*.h"), ]), header_namespace = "", - apple_sdks = (IOS, MACOSX, APPLETVOS), + apple_sdks = (IOS, MACOSX), compiler_flags = [ "-O3", "-ffast-math", diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/test/avgpool-microkernel-tester.h b/aten/src/ATen/native/quantized/cpu/qnnpack/test/avgpool-microkernel-tester.h index 1a425146ad6c2..ac6370f8df29f 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/test/avgpool-microkernel-tester.h +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/test/avgpool-microkernel-tester.h @@ -301,12 +301,12 @@ class AvgPoolMicrokernelTester { ASSERT_NEAR( float(int32_t(y[i * yStride() + k])), yFP[i * kc() + k], 0.5001f) << "at pixel " << i << ", channel " << k << ", n = " << n() - << ", ks = " << kh() << "x" << kw() << " (" << ks() + << ", ks = " << kh() << 'x' << kw() << " (" << ks() << "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k]; ASSERT_EQ( uint32_t(yRef[i * kc() + k]), uint32_t(y[i * yStride() + k])) << "at pixel " << i << ", channel " << k << ", n = " << n() - << ", ks = " << kh() << "x" << kw() << " (" << ks() + << ", ks = " << kh() << 'x' << kw() << " (" << ks() << "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k]; } } @@ -396,12 +396,12 @@ class AvgPoolMicrokernelTester { ASSERT_NEAR( float(int32_t(y[i * yStride() + k])), yFP[i * kc() + k], 0.5001f) << "at pixel " << i << ", channel " << k << ", n = " << n() - << ", ks = " << kh() << "x" << kw() << " (" << ks() + << ", ks = " << kh() << 'x' << kw() << " (" << ks() << "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k]; ASSERT_EQ( uint32_t(yRef[i * kc() + k]), uint32_t(y[i * yStride() + k])) << "at pixel " << i << ", channel " << k << ", n = " << n() - << ", ks = " << kh() << "x" << kw() << " (" << ks() + << ", ks = " << kh() << 'x' << kw() << " (" << ks() << "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k]; } } diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/test/maxpool-microkernel-tester.h b/aten/src/ATen/native/quantized/cpu/qnnpack/test/maxpool-microkernel-tester.h index e1583a2c058ef..fc94f9666d9d0 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/test/maxpool-microkernel-tester.h +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/test/maxpool-microkernel-tester.h @@ -232,7 +232,7 @@ class MaxPoolMicrokernelTester { ASSERT_EQ( uint32_t(yRef[i * kc() + k]), uint32_t(y[i * yStride() + k])) << "at pixel " << i << ", channel " << k << ", n = " << n() - << ", ks = " << kh() << "x" << kw() << " (" << ks() + << ", ks = " << kh() << 'x' << kw() << " (" << ks() << "), kc = " << kc(); } } diff --git a/aten/src/ATen/native/sparse/cuda/SoftMax.cu b/aten/src/ATen/native/sparse/cuda/SoftMax.cu index d39e41c532553..7e3b502bf6f41 100644 --- a/aten/src/ATen/native/sparse/cuda/SoftMax.cu +++ b/aten/src/ATen/native/sparse/cuda/SoftMax.cu @@ -30,10 +30,12 @@ #include #include +#include +#include +#include #include #include #include -#include #include #include @@ -47,6 +49,7 @@ #include #include #include +#include #include #include #include diff --git a/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm 
b/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm index 5dbee4e38af7b..3da1cb5da53c8 100644 --- a/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm +++ b/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm @@ -10,6 +10,10 @@ #include #else #include +#include +#include +#include +#include #include #include #include @@ -441,6 +445,33 @@ Tensor addmm_sparse_dense_mps( return out; } +static std::tuple mps_intersect_binary_search( + const Tensor& A_keys, + const Tensor& B_keys, + int64_t lenA, + int64_t lenB, + bool boolean_flag) { + + auto stream = getCurrentMPSStream(); + auto outA_idx = at::empty({lenA}, A_keys.options().dtype(at::kLong)); + auto outB_idx = at::empty({lenA}, A_keys.options().dtype(at::kLong)); + auto counter = at::zeros({1}, A_keys.options().dtype(at::kInt)); + + dispatch_sync_with_rethrow(stream->queue(), ^() { + @autoreleasepool { + auto pso = lib.getPipelineStateForFunc("intersect_binary_search"); + auto enc = stream->commandEncoder(); + [enc setComputePipelineState:pso]; + mtl_setArgs(enc, A_keys, B_keys, outA_idx, outB_idx, counter, + static_cast(lenB), boolean_flag); + mtl_dispatch1DJob(enc, pso, static_cast(lenA)); + } + }); + + const auto match_count = static_cast(counter.item()); + return std::make_tuple(std::move(outA_idx), std::move(outB_idx), match_count); +} + SparseTensor& mul_out_sparse_mps(const Tensor& t_, const Tensor& src_, SparseTensor& r_) { TORCH_CHECK(r_.is_mps(), "mul: expected 'out' to be MPS, but got ", r_.device()); @@ -519,22 +550,10 @@ Tensor addmm_sparse_dense_mps( auto A_keys = A_is_lhs ? lhs_keys : rhs_keys; auto B_keys = A_is_lhs ? rhs_keys : lhs_keys; - auto outA_idx = at::empty({lenA}, at::device(device).dtype(kLong)); - auto outB_idx = at::empty({lenA}, at::device(device).dtype(kLong)); - auto counter = at::zeros({1}, at::device(device).dtype(kInt)); + auto [outA_idx, outB_idx, M_int64] = mps_intersect_binary_search( + A_keys, B_keys, lenA, lenB, A_is_lhs); - dispatch_sync_with_rethrow(stream->queue(), ^() { - @autoreleasepool { - auto pso = lib.getPipelineStateForFunc("intersect_binary_search"); - auto enc = stream->commandEncoder(); - [enc setComputePipelineState:pso]; - mtl_setArgs(enc, A_keys, B_keys, outA_idx, outB_idx, counter, - static_cast(lenB), A_is_lhs); - mtl_dispatch1DJob(enc, pso, static_cast(lenA)); - } - }); - - const uint32_t M = counter.item(); // number of structural matches + const auto M = static_cast(M_int64); // number of structural matches r_.resize_as_(lhs); @@ -758,6 +777,14 @@ Tensor addmm_sparse_dense_mps( using OptTensor = std::optional; +static Tensor create_sparse_output_values( + const Tensor& template_values, + int64_t output_nnz, + ScalarType dtype) { + auto out_val_sizes = template_values.sizes().vec(); + out_val_sizes[0] = output_nnz; + return at::zeros(out_val_sizes, template_values.options().dtype(dtype)); +} static void sparse_mask_apply_out_mps_kernel( Tensor& result, @@ -779,9 +806,9 @@ static void sparse_mask_apply_out_mps_kernel( auto src = src_in.coalesce(); auto mask = coalesce_mask ? 
mask_in.coalesce() : mask_in; - const int64_t src_nnz = src._nnz(); - const int64_t mask_nnz = mask._nnz(); - const int64_t sd = src.sparse_dim(); + const auto src_nnz = src._nnz(); + const auto mask_nnz = mask._nnz(); + const auto sd = src.sparse_dim(); result.sparse_resize_(mask.sizes(), mask.sparse_dim(), mask.dense_dim()); auto commonDtype = at::result_type(src, mask); @@ -810,53 +837,27 @@ static void sparse_mask_apply_out_mps_kernel( return; } + auto mask_indices = mask._indices().contiguous(); + auto src_values = src._values().to(commonDtype).contiguous(); + auto out_values = create_sparse_output_values(src_values, mask_nnz, commonDtype); + if (src_nnz == 0) { - auto out_indices = mask._indices().contiguous(); - auto src_values = src._values().to(commonDtype); - auto out_val_sizes = src_values.sizes().vec(); - out_val_sizes[0] = mask_nnz; - auto out_values = at::zeros(out_val_sizes, src_values.options()); - alias_into_sparse(result, out_indices, out_values); + alias_into_sparse(result, mask_indices, out_values); result._coalesced_(mask.is_coalesced()); return; } - auto mask_indices = mask._indices().contiguous(); - auto src_indices = src._indices().contiguous(); - auto src_values = src._values().to(commonDtype).contiguous(); - - auto mask_keys = flatten_indices(mask_indices, mask.sizes().slice(0, sd)).contiguous(); - auto src_keys = flatten_indices(src_indices, src.sizes().slice(0, sd)).contiguous(); + auto mask_keys = flatten_indices(mask._indices().contiguous(), mask.sizes().slice(0, sd)).contiguous(); + auto src_keys = flatten_indices(src._indices().contiguous(), src.sizes().slice(0, sd)).contiguous(); - const bool A_is_src = (src_nnz <= mask_nnz); - const int64_t lenA = A_is_src ? src_nnz : mask_nnz; - const int64_t lenB = A_is_src ? mask_nnz : src_nnz; + const auto A_is_src = (src_nnz <= mask_nnz); + const auto lenA = A_is_src ? src_nnz : mask_nnz; + const auto lenB = A_is_src ? mask_nnz : src_nnz; auto A_keys = A_is_src ? src_keys : mask_keys; auto B_keys = A_is_src ? 
mask_keys : src_keys; - const auto device = result.device(); - auto stream = getCurrentMPSStream(); - - auto outA_idx = at::empty({lenA}, at::device(device).dtype(at::kLong)); - auto outB_idx = at::empty({lenA}, at::device(device).dtype(at::kLong)); - auto counter = at::zeros({1}, at::device(device).dtype(at::kInt)); - - dispatch_sync_with_rethrow(stream->queue(), ^() { - @autoreleasepool { - auto pso = lib.getPipelineStateForFunc("intersect_binary_search"); - auto enc = stream->commandEncoder(); - [enc setComputePipelineState:pso]; - mtl_setArgs(enc, A_keys, B_keys, outA_idx, outB_idx, counter, - static_cast(lenB), A_is_src); - mtl_dispatch1DJob(enc, pso, static_cast(lenA)); - } - }); - - const int64_t M = static_cast(counter.item()); - - auto out_val_sizes = src_values.sizes().vec(); - out_val_sizes[0] = mask_nnz; - auto out_values = at::zeros(out_val_sizes, src_values.options()); + auto [outA_idx, outB_idx, M] = mps_intersect_binary_search( + A_keys, B_keys, lenA, lenB, A_is_src); if (M > 0) { auto src_match = outA_idx.narrow(0, 0, M); @@ -874,6 +875,70 @@ static void sparse_mask_apply_out_mps_kernel( result._coalesced_(mask.is_coalesced()); } +static void sparse_mask_projection_out_mps_kernel( + Tensor& result, + const Tensor& lhs, + const Tensor& rhs, + const OptTensor& /*x_hash_opt*/, + bool accumulate_matches) { + + TORCH_CHECK(lhs.is_sparse() && rhs.is_sparse(), "sparse_mask_projection: expected sparse COO"); + TORCH_CHECK(lhs.is_mps() && rhs.is_mps(), "sparse_mask_projection: expected MPS tensors"); + TORCH_CHECK(lhs.sparse_dim() == rhs.sparse_dim(), "sparse_dim mismatch"); + + auto lhs_c = lhs.coalesce(); + auto rhs_c = rhs.coalesce(); + + const auto sd = lhs_c.sparse_dim(); + const auto lhs_nnz = lhs_c._nnz(); + const auto rhs_nnz = rhs_c._nnz(); + + auto commonDtype = at::result_type(lhs_c, rhs_c); + TORCH_CHECK(canCast(commonDtype, result.scalar_type()), + "Can't convert ", commonDtype, " to output ", result.scalar_type()); + + result.sparse_resize_(lhs.sizes(), lhs.sparse_dim(), lhs.dense_dim()); + + auto lhs_indices = lhs_c._indices().contiguous(); + auto rhs_values = rhs_c._values().to(commonDtype).contiguous(); + auto out_values = create_sparse_output_values(rhs_values, lhs_nnz, commonDtype); + + if (lhs_nnz > 0 && rhs_nnz > 0) { + auto lhs_keys = flatten_indices(lhs_indices, lhs_c.sizes().slice(0, sd)).contiguous(); + auto rhs_keys = flatten_indices(rhs_c._indices().contiguous(), rhs_c.sizes().slice(0, sd)).contiguous(); + + const auto A_is_lhs = (lhs_nnz <= rhs_nnz); + const auto lenA = A_is_lhs ? lhs_nnz : rhs_nnz; + const auto lenB = A_is_lhs ? rhs_nnz : lhs_nnz; + auto A_keys = A_is_lhs ? lhs_keys : rhs_keys; + auto B_keys = A_is_lhs ? rhs_keys : lhs_keys; + + auto [outA_idx, outB_idx, M] = mps_intersect_binary_search( + A_keys, B_keys, lenA, lenB, A_is_lhs); + + if (M > 0) { + auto idx_in_A = outA_idx.narrow(0, 0, M); + auto idx_in_B = outB_idx.narrow(0, 0, M); + auto idx_in_lhs = A_is_lhs ? idx_in_A : idx_in_B; + auto idx_in_rhs = A_is_lhs ? 
idx_in_B : idx_in_A; + + const auto view_cols = rhs_values.numel() / std::max(rhs_nnz, 1); + auto rhs_rows = rhs_values.index_select(0, idx_in_rhs).contiguous(); + auto rhs_rows_2d = rhs_rows.view({M, view_cols}); + auto out_2d = out_values.view({lhs_nnz, view_cols}); + + if (accumulate_matches) { + out_2d.index_add_(0, idx_in_lhs, rhs_rows_2d); + } else { + out_2d.index_copy_(0, idx_in_lhs, rhs_rows_2d); + } + } + } + + alias_into_sparse(result, lhs._indices(), out_values); + result._coalesced_(lhs.is_coalesced()); +} + static void sparse_mask_intersection_out_mps_kernel( Tensor& result, const Tensor& lhs, @@ -888,5 +953,115 @@ static void sparse_mask_intersection_out_mps_kernel( /*coalesce_mask=*/false); } +Tensor sparse_sparse_matmul_mps(const Tensor& mat1_, const Tensor& mat2_) { + TORCH_CHECK(mat1_.is_sparse() && mat2_.is_sparse(), + "sparse_sparse_matmul_mps: both inputs must be sparse COO tensors"); + TORCH_CHECK(mat1_.is_mps() && mat2_.is_mps(), + "sparse_sparse_matmul_mps: both inputs must be on MPS device"); + TORCH_CHECK(mat1_.dim() == 2 && mat2_.dim() == 2, + "sparse_sparse_matmul_mps: both inputs must be 2D matrices"); + TORCH_CHECK(mat1_.dense_dim() == 0 && mat2_.dense_dim() == 0, + "sparse_sparse_matmul_mps: only scalar values supported (dense_dim == 0)"); + TORCH_CHECK(mat1_.size(1) == mat2_.size(0), + "mat1 and mat2 shapes cannot be multiplied (", mat1_.size(0), "x", mat1_.size(1), " and ", mat2_.size(0), "x", mat2_.size(1), ")"); + TORCH_CHECK(mat1_.scalar_type() == mat2_.scalar_type(), + "sparse_sparse_matmul_mps: mat1 dtype ", mat1_.scalar_type(), + " does not match mat2 dtype ", mat2_.scalar_type()); + + const auto device = mat1_.device(); + + auto A = mat1_.coalesce(); + auto B = mat2_.coalesce(); + + const auto I = A.size(0); + const auto K = A.size(1); + const auto N = B.size(1); + + const auto nnzA = A._nnz(); + const auto nnzB = B._nnz(); + + // Early empty result, return an empty, coalesced tensor + if (I == 0 || N == 0 || K == 0 || nnzA == 0 || nnzB == 0) { + auto empty_idx = at::empty({2, 0}, at::device(device).dtype(at::kLong)); + auto empty_val = at::empty({0}, at::device(device).dtype(mat1_.scalar_type())); + auto out = _sparse_coo_tensor_unsafe(empty_idx, empty_val, {I, N}, mat1_.options()); + out._coalesced_(true); + return out; + } + + const auto computeDtype = at::result_type(mat1_, mat2_); + + auto A_idx = A._indices().contiguous(); + auto A_val = A._values().to(computeDtype).contiguous(); + auto A_i = A_idx.select(0, 0).contiguous(); + auto A_k = A_idx.select(0, 1).contiguous(); + + auto B_idx = B._indices().contiguous(); + auto B_val = B._values().to(computeDtype).contiguous(); + auto B_k = B_idx.select(0, 0).contiguous(); + auto B_j = B_idx.select(0, 1).contiguous(); + + // csr-style row pointers for B by k (the shared dimension) + Tensor row_ptr_B; + { + auto batch_ptr = at::tensor({0LL, nnzB}, at::device(device).dtype(at::kLong)); + row_ptr_B = at::empty({K + 1}, at::device(device).dtype(at::kLong)); + build_row_ptr_per_batch_mps(B_k, batch_ptr, /*B=*/1, /*I=*/K, row_ptr_B); + } + + auto row_ptr_B_lo = row_ptr_B.narrow(0, 0, K); + auto row_ptr_B_hi = row_ptr_B.narrow(0, 1, K); + auto deg_B = row_ptr_B_hi.sub(row_ptr_B_lo); + + auto counts = deg_B.index_select(0, A_k); + + const int64_t P = counts.sum().item(); + if (P == 0) { + auto empty_idx = at::empty({2, 0}, at::device(device).dtype(at::kLong)); + auto empty_val = at::empty({0}, at::device(device).dtype(mat1_.scalar_type())); + auto out = _sparse_coo_tensor_unsafe(empty_idx, empty_val, {I, N}, 
mat1_.options()); + out._coalesced_(true); + return out; + } + + auto group_ids = repeat_interleave_mps(counts); + + // exclusive cumsum of counts + auto offsets = cumsum(counts, /*dim=*/0).sub(counts); + auto offsets_gather = offsets.index_select(0, group_ids); + auto within = at::arange(P, at::device(device).dtype(at::kLong)).sub(offsets_gather); + + // Map each output element to its source B row and position + auto k_per_out = A_k.index_select(0, group_ids); + auto start_in_B = row_ptr_B.index_select(0, k_per_out); + auto seg_index = start_in_B.add(within); + + // Assemble candidate coo pairs and values + auto i_out = A_i.index_select(0, group_ids).contiguous(); + auto j_out = B_j.index_select(0, seg_index).contiguous(); + auto vA_out = A_val.index_select(0, group_ids).contiguous(); + auto vB_out = B_val.index_select(0, seg_index).contiguous(); + auto v_out = vA_out.mul(vB_out); + + // build (2, P) indices + auto out_indices = at::empty({2, P}, at::device(device).dtype(at::kLong)).contiguous(); + out_indices.select(0, 0).copy_(i_out); + out_indices.select(0, 1).copy_(j_out); + + auto result = _sparse_coo_tensor_unsafe( + out_indices, v_out, {I, N}, mat1_.options().dtype(computeDtype)); + + result = result.coalesce(); + + if (result.scalar_type() != mat1_.scalar_type()) { + auto cast_vals = result._values().to(mat1_.scalar_type()); + auto out = _sparse_coo_tensor_unsafe(result._indices(), cast_vals, {I, N}, mat1_.options()); + out._coalesced_(true); + return out; + } + return result; +} + REGISTER_MPS_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_mps_kernel); +REGISTER_MPS_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_mps_kernel); } // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp index 7fce73151b00f..a6742a7cb9e78 100644 --- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp +++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp @@ -478,7 +478,7 @@ bool check_cudnn_tensor_shapes(sdp_params const& params, bool debug) { const auto s_k = params.key.sym_size(2); const auto d_qk = params.query.sym_size(3); const auto d_v = params.value.sym_size(3); - long cudnn_version = at::detail::getCUDAHooks().versionCuDNN(); + long cudnn_version = at::detail::getCUDAHooks().versionRuntimeCuDNN(); if (cudnn_version < 8903) { if (debug) { TORCH_WARN("SDPA fprop requires cudnn 8.9.3 or higher"); @@ -709,7 +709,7 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) { return false; #endif #if defined(CUDNN_VERSION) - static auto cudnn_version = cudnnGetVersion(); + static auto cudnn_version = at::detail::getCUDAHooks().versionRuntimeCuDNN(); if (params.dropout > 0.0 && cudnn_version > 91100 && cudnn_version < 91400) { if (debug) { TORCH_WARN(CUDNN_VERSION, " cuDNN version does not support droppout in SDPA (9.11 - 9.13)."); diff --git a/aten/src/ATen/native/utils/ParamUtils.h b/aten/src/ATen/native/utils/ParamUtils.h index c9088c03d81c1..8887664df1ce3 100644 --- a/aten/src/ATen/native/utils/ParamUtils.h +++ b/aten/src/ATen/native/utils/ParamUtils.h @@ -17,7 +17,7 @@ inline std::vector _expand_param_if_needed( std::ostringstream ss; ss << "expected " << param_name << " to be a single integer value or a " << "list of " << expected_dim << " values to match the convolution " - << "dimensions, but got " << param_name << "=" << list_param; + << "dimensions, but got " << param_name << '=' << list_param; 
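+      // The '=' above is streamed as a char literal rather than a one-character
+      // string, so the ostream inserts the character directly instead of first
+      // computing a C-string length; the same micro-optimization recurs for the
+      // other single-character literals elsewhere in this change.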
TORCH_CHECK(false, ss.str()); } else { return list_param.vec(); diff --git a/aten/src/ATen/native/vulkan/api/Adapter.cpp b/aten/src/ATen/native/vulkan/api/Adapter.cpp index 173479a0c2de0..350df39ea3684 100644 --- a/aten/src/ATen/native/vulkan/api/Adapter.cpp +++ b/aten/src/ATen/native/vulkan/api/Adapter.cpp @@ -358,9 +358,9 @@ std::string Adapter::stringize() const { std::string device_type = get_device_type_str(properties.deviceType); VkPhysicalDeviceLimits limits = properties.limits; - ss << "{" << std::endl; + ss << '{' << std::endl; ss << " Physical Device Info {" << std::endl; - ss << " apiVersion: " << v_major << "." << v_minor << std::endl; + ss << " apiVersion: " << v_major << '.' << v_minor << std::endl; ss << " driverversion: " << properties.driverVersion << std::endl; ss << " deviceType: " << device_type << std::endl; ss << " deviceName: " << properties.deviceName << std::endl; @@ -371,7 +371,7 @@ std::string Adapter::stringize() const { #define PRINT_LIMIT_PROP_VEC3(name) \ ss << " " << std::left << std::setw(36) << #name << limits.name[0] \ - << "," << limits.name[1] << "," << limits.name[2] << std::endl; + << ',' << limits.name[1] << ',' << limits.name[2] << std::endl; ss << " Physical Device Limits {" << std::endl; PRINT_LIMIT_PROP(maxImageDimension1D); @@ -425,7 +425,7 @@ std::string Adapter::stringize() const { ; } ss << " ]" << std::endl; - ss << "}"; + ss << '}'; return ss.str(); } diff --git a/aten/src/ATen/native/vulkan/api/Exception.cpp b/aten/src/ATen/native/vulkan/api/Exception.cpp index 9b8b653e0619e..436b38cbba6c6 100644 --- a/aten/src/ATen/native/vulkan/api/Exception.cpp +++ b/aten/src/ATen/native/vulkan/api/Exception.cpp @@ -33,7 +33,7 @@ std::ostream& operator<<(std::ostream& out, const VkResult result) { VK_RESULT_CASE(VK_ERROR_FORMAT_NOT_SUPPORTED) VK_RESULT_CASE(VK_ERROR_FRAGMENTED_POOL) default: - out << "VK_ERROR_UNKNOWN (VkResult " << result << ")"; + out << "VK_ERROR_UNKNOWN (VkResult " << result << ')'; break; } return out; @@ -46,7 +46,7 @@ std::ostream& operator<<(std::ostream& out, const VkResult result) { // std::ostream& operator<<(std::ostream& out, const SourceLocation& loc) { - out << loc.function << " at " << loc.file << ":" << loc.line; + out << loc.function << " at " << loc.file << ':' << loc.line; return out; } @@ -66,7 +66,7 @@ Error::Error(SourceLocation source_location, const char* cond, std::string msg) : msg_(std::move(msg)), source_location_{source_location} { std::ostringstream oss; oss << "Exception raised from " << source_location_ << ": "; - oss << "(" << cond << ") is false! "; + oss << '(' << cond << ") is false! 
"; oss << msg_; what_ = oss.str(); } diff --git a/aten/src/ATen/native/vulkan/api/QueryPool.cpp b/aten/src/ATen/native/vulkan/api/QueryPool.cpp index bfa92357daeed..63c163aa44aa9 100644 --- a/aten/src/ATen/native/vulkan/api/QueryPool.cpp +++ b/aten/src/ATen/native/vulkan/api/QueryPool.cpp @@ -173,8 +173,8 @@ void QueryPool::extract_results() { static std::string stringize(const VkExtent3D& extents) { std::stringstream ss; - ss << "{" << extents.width << ", " << extents.height << ", " << extents.depth - << "}"; + ss << '{' << extents.width << ", " << extents.height << ", " << extents.depth + << '}'; return ss.str(); } diff --git a/aten/src/ATen/native/vulkan/api/Runtime.cpp b/aten/src/ATen/native/vulkan/api/Runtime.cpp index cf8402e40a0b8..a7485b706c54e 100644 --- a/aten/src/ATen/native/vulkan/api/Runtime.cpp +++ b/aten/src/ATen/native/vulkan/api/Runtime.cpp @@ -149,7 +149,7 @@ VKAPI_ATTR VkBool32 VKAPI_CALL debug_report_callback_fn( (void)flags; std::stringstream stream; - stream << layer_prefix << " " << message_code << " " << message << std::endl; + stream << layer_prefix << ' ' << message_code << ' ' << message << std::endl; const std::string log = stream.str(); std::cout << log; diff --git a/aten/src/ATen/native/vulkan/api/Utils.h b/aten/src/ATen/native/vulkan/api/Utils.h index 3172c9c461079..8cd6a74c1c467 100644 --- a/aten/src/ATen/native/vulkan/api/Utils.h +++ b/aten/src/ATen/native/vulkan/api/Utils.h @@ -253,7 +253,7 @@ using vec4 = vec<4u>; // uvec3 is the type representing tensor extents. Useful for debugging. inline std::ostream& operator<<(std::ostream& os, const uvec3& v) { - os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ")"; + os << '(' << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ')'; return os; } diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt index 81b3ce90b36bf..a522e7ab76cf4 100644 --- a/aten/src/ATen/test/CMakeLists.txt +++ b/aten/src/ATen/test/CMakeLists.txt @@ -61,6 +61,7 @@ list(APPEND ATen_CUDA_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cuda_complex_math_test.cu ${CMAKE_CURRENT_SOURCE_DIR}/cuda_complex_test.cu ${CMAKE_CURRENT_SOURCE_DIR}/cuda_cub_test.cu + ${CMAKE_CURRENT_SOURCE_DIR}/cuda_cublas_handle_pool_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cuda_device_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cuda_distributions_test.cu ${CMAKE_CURRENT_SOURCE_DIR}/cuda_dlconvertor_test.cpp diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp index 0937de4552821..33fe4121a040e 100644 --- a/aten/src/ATen/test/basic.cpp +++ b/aten/src/ATen/test/basic.cpp @@ -246,7 +246,7 @@ void TestToCFloat() { void TestToString() { Tensor b = ones({3, 7}) * .0000001f; std::stringstream s; - s << b << "\n"; + s << b << '\n'; std::string expect = "1e-07 *"; ASSERT_EQ_RESOLVED(s.str().substr(0, expect.size()), expect); } diff --git a/aten/src/ATen/test/cuda_cublas_handle_pool_test.cpp b/aten/src/ATen/test/cuda_cublas_handle_pool_test.cpp new file mode 100644 index 0000000000000..535bb3d1cc2ea --- /dev/null +++ b/aten/src/ATen/test/cuda_cublas_handle_pool_test.cpp @@ -0,0 +1,77 @@ +#include + +#include +#include +#include + +#include +#include +#include + +// Test concurrent access to getCurrentCUDABlasHandle and getCUDABlasLtWorkspace +// to verify that the data race fix is working correctly + +TEST(CUDABlasHandlePoolTest, ConcurrentGetAndClearWorkspaces) { + if (!at::cuda::is_available()) { + return; + } + + constexpr int num_accessor_threads = 15; + constexpr int num_clear_threads = 5; + constexpr int 
iterations_per_thread = 50; + + std::atomic stop{false}; + std::atomic error_count{0}; + std::vector threads; + threads.reserve(num_accessor_threads + num_clear_threads); + + // Launch accessor threads + for (int i = 0; i < num_accessor_threads; ++i) { + threads.emplace_back([&stop, &error_count]() { + try { + at::cuda::CUDAGuard device_guard(0); + + while (!stop.load(std::memory_order_relaxed)) { + const auto handle = at::cuda::getCurrentCUDABlasHandle(); + const auto workspace = at::cuda::getCUDABlasLtWorkspace(); + + if (handle == nullptr || workspace == nullptr) { + error_count++; + } + } + } catch (const std::exception& e) { + error_count++; + } + }); + } + + // Launch threads that clear workspaces + for (int i = 0; i < num_clear_threads; ++i) { + threads.emplace_back([&error_count]() { + try { + for (int j = 0; j < iterations_per_thread; ++j) { + at::cuda::clearCublasWorkspaces(); + std::this_thread::yield(); + } + } catch (const std::exception& e) { + error_count++; + } + }); + } + + // Let them run for a bit + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + stop.store(true, std::memory_order_relaxed); + + for (auto& thread : threads) { + thread.join(); + } + + EXPECT_EQ(error_count.load(), 0); +} + +int main(int argc, char* argv[]) { + ::testing::InitGoogleTest(&argc, argv); + c10::cuda::CUDACachingAllocator::init(1); + return RUN_ALL_TESTS(); +} diff --git a/aten/src/ATen/test/scalar_test.cpp b/aten/src/ATen/test/scalar_test.cpp index 0d7b62b44d214..a22fb0d16adf8 100644 --- a/aten/src/ATen/test/scalar_test.cpp +++ b/aten/src/ATen/test/scalar_test.cpp @@ -33,7 +33,7 @@ struct Foo { static void apply(Tensor a, Tensor b) { scalar_type s = 1; std::stringstream ss; - ss << "hello, dispatch: " << a.toString() << s << "\n"; + ss << "hello, dispatch: " << a.toString() << s << '\n'; auto data = (scalar_type*)a.data_ptr(); (void)data; } @@ -73,8 +73,8 @@ TEST(TestScalar, TestScalar) { Scalar bar = 3.0; Half h = bar.toHalf(); Scalar h2 = h; - cout << "H2: " << h2.toDouble() << " " << what.toFloat() << " " - << bar.toDouble() << " " << what.isIntegral(false) << "\n"; + cout << "H2: " << h2.toDouble() << ' ' << what.toFloat() << ' ' + << bar.toDouble() << ' ' << what.isIntegral(false) << '\n'; auto gen = at::detail::getDefaultCPUGenerator(); { // See Note [Acquire lock when using random generators] @@ -84,7 +84,7 @@ TEST(TestScalar, TestScalar) { } if (at::hasCUDA()) { auto t2 = zeros({4, 4}, at::kCUDA); - cout << &t2 << "\n"; + cout << &t2 << '\n'; } auto t = ones({4, 4}); @@ -129,7 +129,7 @@ TEST(TestScalar, TestScalar) { std::stringstream ss; // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) ASSERT_NO_THROW( - ss << "hello, dispatch" << x.toString() << s << "\n"); + ss << "hello, dispatch" << x.toString() << s << '\n'); auto data = (scalar_t*)x.data_ptr(); (void)data; }); diff --git a/aten/src/ATen/test/test_install/main.cpp b/aten/src/ATen/test/test_install/main.cpp index e9a03d2303a39..3a57e0c6212bf 100644 --- a/aten/src/ATen/test/test_install/main.cpp +++ b/aten/src/ATen/test/test_install/main.cpp @@ -1,5 +1,5 @@ #include int main() { - std::cout << at::ones({3,4}, at::CPU(at::kFloat)) << "\n"; + std::cout << at::ones({3,4}, at::CPU(at::kFloat)) << '\n'; } diff --git a/aten/src/ATen/test/vec_test_all_types.cpp b/aten/src/ATen/test/vec_test_all_types.cpp index da0da76109569..c0c05c1484175 100644 --- a/aten/src/ATen/test/vec_test_all_types.cpp +++ b/aten/src/ATen/test/vec_test_all_types.cpp @@ -1828,9 +1828,9 @@ namespace { #endif EXPECT_EQ(u16, 
c10::detail::fp16_ieee_from_fp32_value(f32s[i])) - << "Test failed for float to uint16 " << f32s[i] << "\n"; + << "Test failed for float to uint16 " << f32s[i] << '\n'; EXPECT_EQ(x, c10::detail::fp16_ieee_to_fp32_value(u16)) - << "Test failed for uint16 to float " << u16 << "\n"; + << "Test failed for uint16 to float " << u16 << '\n'; } } TEST(FP8E4M3Test, FP8E4M3ConversionFloat) { @@ -1848,10 +1848,10 @@ namespace { EXPECT_TRUE(std::isnan(f32)); } else { EXPECT_EQ(f32, c10::detail::fp8e4m3fn_to_fp32_value(input)) - << "Test failed for u8 to float " << input << "\n"; + << "Test failed for u8 to float " << input << '\n'; } EXPECT_EQ(u8, c10::detail::fp8e4m3fn_from_fp32_value(f32)) - << "Test failed for float to u8 " << f32 << "\n"; + << "Test failed for float to u8 " << f32 << '\n'; } } TEST(FP8E4M3Test, FP8E4M3BinaryAdd) { @@ -2015,10 +2015,10 @@ namespace { EXPECT_TRUE(std::isnan(f32)); } else { EXPECT_EQ(f32, c10::detail::fp8e5m2_to_fp32_value(input)) - << "Test failed for u8 to float " << input << "\n"; + << "Test failed for u8 to float " << input << '\n'; } EXPECT_EQ(u8, c10::detail::fp8e5m2_from_fp32_value(f32)) - << "Test failed for float to u8 " << f32 << "\n"; + << "Test failed for float to u8 " << f32 << '\n'; } } TEST(FP8E5M2Test, FP8E5M2BinaryAdd) { diff --git a/aten/src/ATen/test/vitals.cpp b/aten/src/ATen/test/vitals.cpp index cc93775bb5383..eaf1cc152bc37 100644 --- a/aten/src/ATen/test/vitals.cpp +++ b/aten/src/ATen/test/vitals.cpp @@ -19,7 +19,7 @@ TEST(Vitals, Basic) { c10::utils::set_env("TORCH_VITAL", "1"); TORCH_VITAL_DEFINE(Testing); TORCH_VITAL(Testing, Attribute0) << 1; - TORCH_VITAL(Testing, Attribute1) << "1"; + TORCH_VITAL(Testing, Attribute1) << '1'; TORCH_VITAL(Testing, Attribute2) << 1.0f; TORCH_VITAL(Testing, Attribute3) << 1.0; auto t = at::ones({1, 1}); diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp index 396ea59d2f008..29f01fbd78c51 100644 --- a/aten/src/ATen/test/vulkan_api_test.cpp +++ b/aten/src/ATen/test/vulkan_api_test.cpp @@ -129,14 +129,14 @@ void showRtol(const at::Tensor& a, const at::Tensor& b) { std::cout << "Max Diff allowed: " << maxDiff << std::endl; if (diff.sizes().size() == 2) { for (const auto y : c10::irange(diff.sizes()[0])) { - std::cout << y << ":"; + std::cout << y << ':'; for (const auto x : c10::irange(diff.sizes()[1])) { float diff_xy = diff[y][x].item(); if (diff_xy > maxDiff) { std::cout << std::setw(5) << x; } else { - std::cout << std::setw(5) << " "; + std::cout << std::setw(5) << ' '; } } std::cout << std::endl; @@ -3276,7 +3276,7 @@ TEST_F(VulkanAPITest, masked_fill_invalidinputs_exceptions) { void print_shape(const std::vector& shape) { for (const auto& num : shape) { - std::cout << num << " "; + std::cout << num << ' '; } } @@ -3367,7 +3367,7 @@ void test_masked_fill_scalar( print_shape(tmp_curr_input_shape); std::cout << "], and mask of shape ["; print_shape(tmp_curr_mask_shape); - std::cout << "]" << std::endl; + std::cout << ']' << std::endl; } ASSERT_TRUE(check); @@ -4542,9 +4542,9 @@ void test_softmax(const at::IntArrayRef shape, bool log_softmax = false) { if (!check) { std::cout << "Softmax test failed on axis " << dim << "for tensor dims {"; for (uint32_t place = 0; place < shape.size() - 1; place++) { - std::cout << shape[place] << " "; + std::cout << shape[place] << ' '; } - std::cout << shape.back() << "}" << std::endl; + std::cout << shape.back() << '}' << std::endl; showRtol(out_cpu, out_vulkan.cpu()); } ASSERT_TRUE(check); diff --git 
a/aten/src/ATen/test/vulkan_quantized_api_test.cpp b/aten/src/ATen/test/vulkan_quantized_api_test.cpp index 2829aed94def9..2eff421a64ced 100644 --- a/aten/src/ATen/test/vulkan_quantized_api_test.cpp +++ b/aten/src/ATen/test/vulkan_quantized_api_test.cpp @@ -95,7 +95,7 @@ void showRtol( std::cout << "Max Diff found is: " << diff.max().item() << std::endl; if (diff.sizes().size() == 2) { for (const auto y : c10::irange(diff.sizes()[0])) { - std::cout << y << ":"; + std::cout << y << ':'; for (const auto x : c10::irange(diff.sizes()[1])) { double diff_xy = diff[y][x].item(); if (diff_xy > maxDiff) { @@ -109,7 +109,7 @@ void showRtol( } } } else { - std::cout << std::setw(5) << " "; + std::cout << std::setw(5) << ' '; } } std::cout << std::endl; @@ -148,19 +148,19 @@ using at::native::vulkan::api::utils::ivec4; using at::native::vulkan::api::utils::vec4; std::ostream& operator<<(std::ostream& os, const vec4& v) { - os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", " - << v.data[3u] << ")"; + os << '(' << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", " + << v.data[3u] << ')'; return os; } std::ostream& operator<<(std::ostream& os, const ivec3& v) { - os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ")"; + os << '(' << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ')'; return os; } std::ostream& operator<<(std::ostream& os, const ivec4& v) { - os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", " - << v.data[3u] << ")"; + os << '(' << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", " + << v.data[3u] << ')'; return os; } @@ -3379,51 +3379,51 @@ bool _test_quantized_linear( showRtol(out_cpu_dequant, out_vk_to_cpu_dequant); } if (xpos != -1 && ypos != -1) { - std::cout << "\nFailure caused on row/col: " << ypos << "/" << xpos - << "\n"; + std::cout << "\nFailure caused on row/col: " << ypos << '/' << xpos + << '\n'; std::cout << "Input tensor scale: " << scale << " zerop: " << zero_point - << "\n"; - std::cout << "Input tensor row " << ypos << "\n"; + << '\n'; + std::cout << "Input tensor row " << ypos << '\n'; for (int i = 0; i < input_cpu.sizes()[1]; i++) { std::cout << input_cpu[ypos][i].item() << ", "; } - std::cout << "\n"; + std::cout << '\n'; std::cout << "Weight tensor scale: " << w_scale - << " zerop: " << w_zero_point << "\n"; - std::cout << "Weight tensor col " << xpos << "\n"; + << " zerop: " << w_zero_point << '\n'; + std::cout << "Weight tensor col " << xpos << '\n'; for (int i = 0; i < weight.sizes()[1]; i++) { std::cout << weight[xpos][i].item() << ", "; } - std::cout << "\n"; + std::cout << '\n'; std::cout << "Input tensor quantized row " << ypos << " with dtype " - << (input_quant_dtype_int8 ? "QInt8" : "QUInt8") << "\n"; + << (input_quant_dtype_int8 ? "QInt8" : "QUInt8") << '\n'; for (int i = 0; i < input_cpu.sizes()[1]; i++) { std::cout << input_cpu_quantized[ypos][i].item() << ", "; } - std::cout << "\n"; + std::cout << '\n'; std::cout << "Weight tensor quantized col " << xpos << " with dtype " - << (weight_quant_dtype_int8 ? "QInt8" : "QUInt8") << "\n"; + << (weight_quant_dtype_int8 ? 
"QInt8" : "QUInt8") << '\n'; for (int i = 0; i < weight.sizes()[1]; i++) { std::cout << weight_cpu_quantized[xpos][i].item() << ", "; } - std::cout << "\n"; + std::cout << '\n'; std::cout << "bias tensor\n"; for (int i = 0; i < bias.sizes()[0]; i++) { std::cout << bias[i].item() << ", "; } - std::cout << "\n"; + std::cout << '\n'; std::cout << "out_scale: " << out_scale - << " out_zero_point: " << out_zero_point << "\n"; + << " out_zero_point: " << out_zero_point << '\n'; std::cout << "cpu unmatched output: " - << out_cpu_dequant[ypos][xpos].item() << "\n"; + << out_cpu_dequant[ypos][xpos].item() << '\n'; std::cout << "vk unmatched output: " - << out_vk_to_cpu_dequant[ypos][xpos].item() << "\n"; + << out_vk_to_cpu_dequant[ypos][xpos].item() << '\n'; } } return check; diff --git a/aten/src/ATen/xpu/XPUEvent.h b/aten/src/ATen/xpu/XPUEvent.h index 19d42aae080f1..f33fd70ac0619 100644 --- a/aten/src/ATen/xpu/XPUEvent.h +++ b/aten/src/ATen/xpu/XPUEvent.h @@ -1,191 +1,3 @@ #pragma once #include - -#include - -namespace at::xpu { - -/* - * XPUEvent are movable not copyable wrappers around SYCL event. XPUEvent are - * constructed lazily when first recorded. It has a device, and this device is - * acquired from the first recording stream. Later streams that record the event - * must match the same device. - * - * Currently, XPUEvent does NOT support to export an inter-process event from - * another process via inter-process communication(IPC). So it means that - * inter-process communication for event handles between different processes is - * not available. This could impact some applications that rely on cross-process - * synchronization and communication. - */ -struct TORCH_XPU_API XPUEvent { - // Constructors - XPUEvent(bool enable_timing = false) noexcept - : enable_timing_{enable_timing} {} - - ~XPUEvent() { - if (isCreated()) { - const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); - if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_deletion( - at::kXPU, reinterpret_cast(event_.get())); - } - } - } - - XPUEvent(const XPUEvent&) = delete; - XPUEvent& operator=(const XPUEvent&) = delete; - - XPUEvent(XPUEvent&& other) = default; - XPUEvent& operator=(XPUEvent&& other) = default; - - operator sycl::event&() const { - return event(); - } - - std::optional device() const { - if (isCreated()) { - return at::Device(at::kXPU, device_index_); - } else { - return std::nullopt; - } - } - - inline bool isCreated() const { - return (event_.get() != nullptr); - } - - DeviceIndex device_index() const { - return device_index_; - } - - sycl::event& event() const { - return *event_; - } - - bool query() const { - using namespace sycl::info; - if (!isCreated()) { - return true; - } - - return event().get_info() == - event_command_status::complete; - } - - void record() { - record(getCurrentXPUStream()); - } - - void recordOnce(const XPUStream& stream) { - if (!isCreated()) { - record(stream); - } - } - - void record(const XPUStream& stream) { - if (!isCreated()) { - device_index_ = stream.device_index(); - assignEvent(stream.queue()); - const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); - if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_creation( - at::kXPU, reinterpret_cast(event_.get())); - } - } else { - TORCH_CHECK( - device_index_ == stream.device_index(), - "Event device ", - device_index_, - " does not match recording stream's device ", - stream.device_index(), - "."); - reassignEvent(stream.queue()); - } - const c10::impl::PyInterpreter* 
interp = c10::impl::GPUTrace::get_trace(); - if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_record( - at::kXPU, - reinterpret_cast(event_.get()), - reinterpret_cast(&stream.queue())); - } - } - - void block(const XPUStream& stream) { - if (isCreated()) { - std::vector event_list{event()}; - // Make this stream wait until event_ is completed. - stream.queue().ext_oneapi_submit_barrier(event_list); - const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); - if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_wait( - at::kXPU, - reinterpret_cast(event_.get()), - reinterpret_cast(&stream.queue())); - } - } - } - - double elapsed_time(const XPUEvent& other) const { - TORCH_CHECK( - isCreated() && other.isCreated(), - "Both events must be recorded before calculating elapsed time."); - TORCH_CHECK( - query() && other.query(), - "Both events must be completed before calculating elapsed time."); - TORCH_CHECK( - enable_timing_ && other.enable_timing_, - "Both events must be created with argument 'enable_timing=True'."); - -#if SYCL_COMPILER_VERSION < 20250000 - TORCH_CHECK_NOT_IMPLEMENTED( - false, - "elapsed_time of XPUEvent requires PyTorch to be built with SYCL compiler version 2025.0.0 or newer."); -#endif - - using namespace sycl::info::event_profiling; - // Block until both of the recorded events are completed. - uint64_t end_time_ns = other.event().get_profiling_info(); - uint64_t start_time_ns = event().get_profiling_info(); - // Return the eplased time in milliseconds. - return 1e-6 * - (static_cast(end_time_ns) - static_cast(start_time_ns)); - } - - void synchronize() const { - if (isCreated()) { - const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); - if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_synchronization( - at::kXPU, reinterpret_cast(event_.get())); - } - event().wait_and_throw(); - } - } - - private: - void assignEvent(sycl::queue& queue) { -#if SYCL_COMPILER_VERSION >= 20250000 - if (enable_timing_) { - event_ = std::make_unique( - sycl::ext::oneapi::experimental::submit_profiling_tag(queue)); - } else { - event_ = std::make_unique(queue.ext_oneapi_submit_barrier()); - } -#else - event_ = std::make_unique(queue.ext_oneapi_submit_barrier()); -#endif - } - - void reassignEvent(sycl::queue& queue) { - event_.reset(); - assignEvent(queue); - } - - bool enable_timing_ = false; - DeviceIndex device_index_ = -1; - // Only need to track the last event, as events in an in-order queue are - // executed sequentially. 
- std::unique_ptr event_; -}; - -} // namespace at::xpu +#include diff --git a/aten/src/ATen/xpu/XPUScaledBlas.cpp b/aten/src/ATen/xpu/XPUScaledBlas.cpp new file mode 100644 index 0000000000000..ea7e043da40ec --- /dev/null +++ b/aten/src/ATen/xpu/XPUScaledBlas.cpp @@ -0,0 +1,122 @@ +#include +#include +#include +#include +#include +#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +using at::blas::ScalingType; + +namespace at::native::onednn::scaled { + +/** + * Both inputs must be fp8, + * Each needs a single scale, {Tensorwise (float)} + */ +bool check_tensorwise_recipe( + c10::ScalarType type_a, + std::vector& recipe_a, + ArrayRef& scales_a, + c10::ScalarType type_b, + std::vector& recipe_b, + ArrayRef& scales_b) { + // both types must be fp8 + if (!isFloat8Type(type_a) || !isFloat8Type(type_b)) { + return false; + } + + // 1 scale each, {Tensorwise, float} + if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || + recipe_b.size() != 1) { + return false; + } + // Need {Blockwise_1x32, e8m0} for A & B + if (recipe_a[0] != ScalingType::TensorWise) + return false; + if (scales_a[0].scalar_type() != ScalarType::Float) + return false; + if (recipe_b[0] != ScalingType::TensorWise) + return false; + if (scales_b[0].scalar_type() != ScalarType::Float) + return false; + + return true; +} + +/** + * Both inputs must be fp8, + * Each needs scales, {Rowwise (float)} + */ +bool check_rowwise_recipe( + c10::ScalarType type_a, + std::vector& recipe_a, + ArrayRef& scales_a, + c10::ScalarType type_b, + std::vector& recipe_b, + ArrayRef& scales_b) { + // both types must be fp8 + if (!isFloat8Type(type_a) || !isFloat8Type(type_b)) { + return false; + } + + // 1 scale each, {Tensorwise, float} + if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || + recipe_b.size() != 1) { + return false; + } + + // Need {RowWise, dp32} for A & B + if (recipe_a[0] != ScalingType::RowWise) + return false; + if (scales_a[0].scalar_type() != ScalarType::Float) + return false; + if (recipe_b[0] != ScalingType::RowWise) + return false; + if (scales_b[0].scalar_type() != ScalarType::Float) + return false; + + return true; +} + +} // namespace at::native::onednn::scaled diff --git a/aten/src/ATen/xpu/XPUScaledBlas.h b/aten/src/ATen/xpu/XPUScaledBlas.h new file mode 100644 index 0000000000000..2940dbfc56dfe --- /dev/null +++ b/aten/src/ATen/xpu/XPUScaledBlas.h @@ -0,0 +1,95 @@ +#include +#include +#include +#include +#include +#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifdef USE_FBGEMM_GENAI +#include +#endif + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +using at::blas::ScalingType; + +namespace at::native::onednn::scaled { + +/** + * Track concrete implementations available + */ +enum class ScaledGemmImplementation { + NONE = 0, + 
TENSORWISE_TENSORWISE = 1, + ROWWISE_ROWWISE = 2, +}; + +/** + * Convert passed int (enum) from python back into a + * strictly-typed enum + */ +template +std::vector convert_int_to_enum(ArrayType& v) { + std::vector converted; + converted.reserve(v.size()); + + for (auto vi : v) { + converted.push_back(static_cast(vi)); + } + return converted; +} + +bool check_tensorwise_recipe( + c10::ScalarType, + std::vector&, + ArrayRef&, + c10::ScalarType, + std::vector&, + ArrayRef&); + +bool check_rowwise_recipe( + c10::ScalarType, + std::vector&, + ArrayRef&, + c10::ScalarType, + std::vector&, + ArrayRef&); + +} // namespace at::native::onednn::scaled diff --git a/aten/tools/valgrind.sup b/aten/tools/valgrind.sup index ad5f66e0b0531..585487c4d2be2 100644 --- a/aten/tools/valgrind.sup +++ b/aten/tools/valgrind.sup @@ -10,6 +10,13 @@ ... } +{ + ignore_empty_generic_uninitialised_conditional_jump + Memcheck:Cond + fun:_ZN2at6detail13empty_genericEN3c108ArrayRefIlEEPNS1_9AllocatorENS1_14DispatchKeySetENS1_10ScalarTypeESt8optionalINS1_12MemoryFormatEE + ... +} + { Cond_cuda Memcheck:Cond diff --git a/benchmarks/dynamo/check_accuracy.py b/benchmarks/dynamo/check_accuracy.py index 83cca8b36b993..7f8be84b93fd7 100644 --- a/benchmarks/dynamo/check_accuracy.py +++ b/benchmarks/dynamo/check_accuracy.py @@ -50,6 +50,7 @@ def check_accuracy(actual_csv, expected_csv, expected_filename): "mobilenet_v2", "pytorch_CycleGAN_and_pix2pix", "pytorch_stargan", + "repvgg_a2", "resnet152", "resnet18", "resnet50", diff --git a/benchmarks/dynamo/check_perf_csv.py b/benchmarks/dynamo/check_perf_csv.py index 320a4544f829b..08070dda4444c 100644 --- a/benchmarks/dynamo/check_perf_csv.py +++ b/benchmarks/dynamo/check_perf_csv.py @@ -9,28 +9,61 @@ def check_perf_csv(filename, threshold, threshold_scale): """ Basic performance checking. 
""" + try: + df = pd.read_csv(filename) + except FileNotFoundError: + print(f"Error: File {filename} not found") + sys.exit(1) - df = pd.read_csv(filename) + effective_threshold = threshold * threshold_scale + print(f"Checking {filename} (speedup threshold >= {effective_threshold:.2f}x)\n") failed = [] for _, row in df.iterrows(): model_name = row["name"] - speedup = row["speedup"] - if speedup < threshold * threshold_scale: - failed.append(model_name) + speedup = float(row["speedup"]) + abs_latency = float(row["abs_latency"]) + compilation_latency = float(row["compilation_latency"]) + compression_ratio = float(row["compression_ratio"]) + eager_peak_mem = float(row["eager_peak_mem"]) + dynamo_peak_mem = float(row["dynamo_peak_mem"]) + + perf_summary = f"{model_name:34} speedup={speedup:.3f}x" + if pd.notna(abs_latency): + perf_summary += f", latency={abs_latency:.1f} ms/iter" + if pd.notna(compilation_latency): + perf_summary += f", compile={compilation_latency:.3f}s" + if pd.notna(compression_ratio): + perf_summary += f", mem_ratio={1 / compression_ratio:.2f}x" + if pd.notna(eager_peak_mem) and pd.notna(dynamo_peak_mem): + perf_summary += ( + f" (eager={eager_peak_mem:.1f} GB, dynamo={dynamo_peak_mem:.1f} GB)" + ) + + if speedup < effective_threshold: + failed.append((model_name, speedup)) - print(f"{model_name:34} {speedup}") + print(perf_summary) if failed: print( textwrap.dedent( f""" - Error {len(failed)} models performance regressed - {" ".join(failed)} + Error {len(failed)} model(s) performance regressed + {" ".join([name for name, _ in failed])} """ ) ) + for name, sp in sorted(failed, key=lambda x: x[1]): + pct_from_target = (sp / effective_threshold - 1.0) * 100.0 + print( + f" - {name}: {sp:.3f}x (< {effective_threshold:.2f}x; {pct_from_target:.1f}% from target)" + ) sys.exit(1) + else: + print( + f"\nAll {len(df)} model(s) passed threshold check (>= {effective_threshold:.2f}x)" + ) if __name__ == "__main__": @@ -44,7 +77,7 @@ def check_perf_csv(filename, threshold, threshold_scale): "-s", type=float, default=1.0, - help="multiple threshold by this value to relax the check", + help="multiply threshold by this value to relax the check", ) args = parser.parse_args() check_perf_csv(args.file, args.threshold, args.threshold_scale) diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_timm_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_timm_training.csv index b5e457e58997d..b2f40504a4991 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_timm_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_timm_training.csv @@ -10,7 +10,7 @@ beit_base_patch16_224,pass,7 -convnextv2_nano.fcmae_ft_in22k_in1k,pass,7 +convnextv2_nano.fcmae_ft_in22k_in1k,fail_accuracy,7 @@ -66,7 +66,7 @@ visformer_small,pass,7 -vit_base_patch14_dinov2.lvd142m,pass,7 +vit_base_patch14_dinov2.lvd142m,fail_accuracy,7 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_timm_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_timm_training.csv index b2071874b70d6..2d087e6595526 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_timm_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_timm_training.csv @@ -50,7 +50,7 @@ nfnet_l0,pass,7 -repvgg_a2,fail_accuracy,7 +repvgg_a2,pass,7 diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py index e0681f52586e7..b3484e7196a83 100644 --- a/benchmarks/dynamo/common.py +++ 
b/benchmarks/dynamo/common.py @@ -952,7 +952,7 @@ def latency_experiment_summary(suite_name, args, model, timings, **kwargs): first_fields.append(kwargs["tag"]) headers = first_headers + ["speedup", "abs_latency"] row = first_fields + [float(speedup), median[1] * 1000] - msg = f"{speedup:.3f}x" + msg = f"{median[0] * 1000} ms, {median[1] * 1000} ms, {speedup:.3f}x" if args.baseline: headers.extend( [ @@ -1010,7 +1010,7 @@ def latency_experiment_summary(suite_name, args, model, timings, **kwargs): # Hypothetically you can use this from other places, but it's currently # inaccessible, and when this assert fails you need to update the # event_name here to account for the other cases you are using this - assert args.quantization is not None + assert any([args.quantization, args.optimus]) output_signpost( dict(zip(headers, row)), args, @@ -2288,11 +2288,9 @@ def record_status(accuracy_status, dynamo_start_stats): ) ): is_same = False - except Exception as e: + except Exception: # Sometimes torch.allclose may throw RuntimeError - exception_string = str(e) - accuracy_status = f"fail_exception: {exception_string}" - return record_status(accuracy_status, dynamo_start_stats=start_stats) + is_same = False if not is_same: accuracy_status = "eager_two_runs_differ" @@ -2381,7 +2379,9 @@ def record_status(accuracy_status, dynamo_start_stats): print( f"Load model outputs from {self.args.compare_model_outputs_with} to compare" ) - saved_result = torch.load(self.args.compare_model_outputs_with) + saved_result = torch.load( + self.args.compare_model_outputs_with, weights_only=False + ) is_bitwise_same = bitwise_same(saved_result, new_result) if not is_bitwise_same: print( @@ -2409,11 +2409,9 @@ def record_status(accuracy_status, dynamo_start_stats): force_max_multiplier=force_max_multiplier, ): is_same = False - except Exception as e: + except Exception: # Sometimes torch.allclose may throw RuntimeError - exception_string = str(e) - accuracy_status = f"fail_exception: {exception_string}" - return record_status(accuracy_status, dynamo_start_stats=start_stats) + is_same = False if not is_same: if self.args.skip_accuracy_check: @@ -2587,6 +2585,9 @@ def warmup(fn, model, example_inputs, mode, niters=10): **experiment_kwargs, ) + # reset dynamo + torch._dynamo.reset() + if self.args.export_aot_inductor: optimized_model_iter_fn = optimize_ctx else: @@ -2950,7 +2951,7 @@ def run_one_model( status = self.check_tolerance(name, model, example_inputs, optimize_ctx) print(status) elif self.args.performance: - if self.args.backend == "torchao": + if self.args.backend in ["torchao", "optimus"]: status = self.run_performance_test_non_alternate( name, model, example_inputs, optimize_ctx, experiment, tag ) @@ -3526,6 +3527,12 @@ def get_example_inputs(self): action="store_true", help="Measure speedup with TorchInductor", ) + group.add_argument( + "--optimus", + choices=["vertical_opt", "horizontal_opt", "all"], + default=None, + help="Measure speedup of Optimus with TorchInductor baseline", + ) group.add_argument( "--quantization", choices=[ @@ -3783,6 +3790,9 @@ def run(runner, args, original_dir=None): if args.inductor: assert args.backend is None args.backend = "inductor" + if args.optimus: + assert args.backend is None + args.backend = "optimus" if args.quantization: assert args.backend is None args.backend = "torchao" @@ -4067,10 +4077,22 @@ def model_iter_fn_and_mark_step(*args, **kwargs): runner.model_iter_fn = model_iter_fn_and_mark_step optimize_ctx = torchao_optimize_ctx(args.quantization) + elif args.backend == 
"optimus": + from .optimus import get_baseline_ctx, get_optimus_optimize_ctx + + baseline_ctx = get_baseline_ctx( + nopython=args.nopython, inductor_compile_mode=args.inductor_compile_mode + ) + runner.model_iter_fn = baseline_ctx(runner.model_iter_fn) + optimize_ctx = get_optimus_optimize_ctx( + args.optimus, args.nopython, args.inductor_compile_mode + ) else: optimize_ctx = torch._dynamo.optimize(args.backend, nopython=args.nopython) experiment = ( - speedup_experiment if args.backend != "torchao" else latency_experiment + speedup_experiment + if args.backend not in ["torchao", "optimus"] + else latency_experiment ) if args.accuracy: output_filename = f"accuracy_{args.backend}.csv" @@ -4091,7 +4113,12 @@ def model_iter_fn_and_mark_step(*args, **kwargs): if args.only in runner.disable_cudagraph_models: args.disable_cudagraphs = True - if args.inductor or args.backend == "inductor" or args.export_aot_inductor: + if ( + args.inductor + or args.backend == "inductor" + or args.export_aot_inductor + or args.backend == "optimus" + ): inductor_config.triton.cudagraphs = not args.disable_cudagraphs inductor_config.triton.persistent_reductions = ( not args.disable_persistent_reductions diff --git a/benchmarks/dynamo/optimus.py b/benchmarks/dynamo/optimus.py new file mode 100644 index 0000000000000..f188b698edd5f --- /dev/null +++ b/benchmarks/dynamo/optimus.py @@ -0,0 +1,62 @@ +import functools + +import torch + + +def get_baseline_ctx(nopython, inductor_compile_mode): + return functools.partial( + torch.compile, + backend="inductor", + fullgraph=nopython, + mode=inductor_compile_mode, + ) + + +def get_optimus_optimize_ctx(config, nopython, inductor_compile_mode): + if config == "vertical_opt": + optimus_inductor_config = { + "pre_grad_fusion_options": { + "normalization_pass": {}, + "merge_splits_pass": {}, + "split_cat_pass": {}, + "unbind_stack_pass": {}, + "unbind_cat_to_view_pass": {}, + } + } + elif config == "horizontal_opt": + optimus_inductor_config = { + "pre_grad_fusion_options": { + "normalization_pass": {}, + "batch_linear": {}, + "batch_layernorm": {}, + }, + } + elif config == "all": + optimus_inductor_config = { + "pre_grad_fusion_options": { + "normalization_pass": {}, + "batch_linear": {}, + "batch_layernorm": {}, + "merge_splits_pass": {}, + "split_cat_pass": {}, + "unbind_stack_pass": {}, + "unbind_cat_to_view_pass": {}, + }, + } + else: + raise RuntimeError(f"Unknown optimus config: {config}") + + def _inner(fn): + if "pre_grad_fusion_options" in optimus_inductor_config: + torch._inductor.config.pre_grad_fusion_options = optimus_inductor_config[ + "pre_grad_fusion_options" + ] + if "post_grad_fusion_options" in optimus_inductor_config: + torch._inductor.config.post_grad_fusion_options = optimus_inductor_config[ + "post_grad_fusion_options" + ] + return torch.compile( + fn, backend="inductor", fullgraph=nopython, mode=inductor_compile_mode + ) + + return _inner diff --git a/benchmarks/dynamo/parse_logs.py b/benchmarks/dynamo/parse_logs.py index 8704fda9b997a..a3def611bbcc2 100644 --- a/benchmarks/dynamo/parse_logs.py +++ b/benchmarks/dynamo/parse_logs.py @@ -2,6 +2,7 @@ import os import re import sys +from pathlib import Path # This script takes the logs produced by the benchmark scripts (e.g., @@ -15,8 +16,7 @@ # This script is not very well written, feel free to rewrite it as necessary assert len(sys.argv) == 2 - -full_log = open(sys.argv[1]).read() +full_log = Path(sys.argv[1]).read_text() # If the log contains a gist URL, extract it so we can include it in the CSV gist_url 
= "" diff --git a/benchmarks/dynamo/pr_time_benchmarks/benchmarks/dtensor.py b/benchmarks/dynamo/pr_time_benchmarks/benchmarks/dtensor.py new file mode 100644 index 0000000000000..db59dfacb3f82 --- /dev/null +++ b/benchmarks/dynamo/pr_time_benchmarks/benchmarks/dtensor.py @@ -0,0 +1,62 @@ +import sys + +from benchmark_base import BenchmarkBase + +import torch +from torch.distributed._tensor import DTensor, Replicate +from torch.testing._internal.distributed.fake_pg import FakeStore + + +class BenchmarkDTensorDispatch(BenchmarkBase): + def __init__(self, operator, world_size) -> None: + super().__init__( + category=f"dtensor_dispatch_{operator}", + device="cuda", + ) + self.world_size = world_size + + def name(self) -> str: + prefix = f"{self.category()}" + return prefix + + def description(self) -> str: + return f"DTensor dispatch time for {self.category()}" + + def _prepare_once(self) -> None: + self.mesh = torch.distributed.device_mesh.init_device_mesh( + "cuda", (self.world_size,), mesh_dim_names=("dp",) + ) + self.a = DTensor.from_local( + torch.ones(10, 10, device=self.device()), self.mesh, [Replicate()] + ) + self.b = DTensor.from_local( + torch.ones(10, 10, device=self.device()), self.mesh, [Replicate()] + ) + + def _prepare(self) -> None: + pass + + +class BenchmarkDetach(BenchmarkDTensorDispatch): + def __init__(self, world_size) -> None: + super().__init__(operator="detach", world_size=world_size) + + def _work(self) -> None: + self.a.detach() + + +def main(): + world_size = 256 + fake_store = FakeStore() + torch.distributed.init_process_group( + "fake", store=fake_store, rank=0, world_size=world_size + ) + result_path = sys.argv[1] + BenchmarkDetach(world_size).enable_instruction_count().collect_all().append_results( + result_path + ) + torch.distributed.destroy_process_group() + + +if __name__ == "__main__": + main() diff --git a/benchmarks/dynamo/torchbench.yaml b/benchmarks/dynamo/torchbench.yaml index b31a85ae26763..974c3d700a045 100644 --- a/benchmarks/dynamo/torchbench.yaml +++ b/benchmarks/dynamo/torchbench.yaml @@ -189,6 +189,10 @@ skip: - hf_Whisper - hf_distil_whisper - timm_vision_transformer_large + # https://github.com/pytorch/pytorch/issues/167895 + - stable_diffusion + - stable_diffusion_text_encoder + - stable_diffusion_unet device: cpu: diff --git a/benchmarks/operator_benchmark/aarch64_expected_ci_operator_benchmark_eager_float32_cpu.csv b/benchmarks/operator_benchmark/aarch64_expected_ci_operator_benchmark_eager_float32_cpu.csv index dc8b240ce570f..f3d8c7e65af04 100644 --- a/benchmarks/operator_benchmark/aarch64_expected_ci_operator_benchmark_eager_float32_cpu.csv +++ b/benchmarks/operator_benchmark/aarch64_expected_ci_operator_benchmark_eager_float32_cpu.csv @@ -484,24 +484,106 @@ PyTorch,sum,sum_R256_V512_dim0_contiguousTrue_cpu,short,False,50.954394,0.000000 PyTorch,sum,sum_R256_V512_dim0_contiguousFalse_cpu,short,False,57.957757,0.000000 PyTorch,sum,sum_R256_V512_dim1_contiguousTrue_cpu,short,False,53.592068,0.000000 PyTorch,sum,sum_R256_V512_dim1_contiguousFalse_cpu,short,False,51.339726,0.000000 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N16_cpu,short,False,7.040985,0.000000 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N64_cpu,short,False,7.168604,0.000000 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N128_cpu,short,False,7.434442,0.000000 
-PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N16_cpu,short,False,7.078318,0.000000 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N64_cpu,short,False,7.426670,0.000000 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N128_cpu,short,False,7.679027,0.000000 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N16_cpu,short,False,7.281365,0.000000 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N64_cpu,short,False,7.682783,0.000000 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N128_cpu,short,False,8.381938,0.000000 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N16_cpu,short,False,7.039854,0.000000 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N64_cpu,short,False,7.399855,0.000000 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N128_cpu,short,False,7.715193,0.000000 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N16_cpu,short,False,7.255140,0.000000 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N64_cpu,short,False,7.753522,0.000000 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N128_cpu,short,False,8.364281,0.000000 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N16_cpu,short,False,7.476377,0.000000 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N64_cpu,short,False,8.458564,0.000000 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N128_cpu,short,False,9.391939,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.bool,short,False,0.927,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.uint8,short,False,6.261,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int8,short,False,6.351,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int16,short,False,6.177,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int32,short,False,6.333,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int64,short,False,6.588,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float16,short,False,8.117,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.bfloat16,short,False,9.358,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float32,short,False,7.844,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float64,short,False,8.097,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.bool,short,False,6.159,0.000000 
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,0.926,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int8,short,False,6.192,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int16,short,False,6.276,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,6.461,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int64,short,False,6.524,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float16,short,False,8.136,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.bfloat16,short,False,6.854,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float32,short,False,6.446,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float64,short,False,6.829,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.bool,short,False,6.088,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.uint8,short,False,6.059,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int8,short,False,0.922,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int16,short,False,6.263,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int32,short,False,6.330,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int64,short,False,6.688,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float16,short,False,8.176,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.bfloat16,short,False,6.959,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float32,short,False,6.430,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float64,short,False,6.818,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.bool,short,False,6.350,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.uint8,short,False,6.221,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int8,short,False,6.193,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int16,short,False,0.922,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int32,short,False,6.263,0.000000 
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int64,short,False,6.525,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float16,short,False,7.960,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.bfloat16,short,False,6.801,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float32,short,False,6.594,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float64,short,False,7.089,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.bool,short,False,6.498,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,6.358,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int8,short,False,6.390,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int16,short,False,6.415,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,0.925,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int64,short,False,6.657,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float16,short,False,7.954,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.bfloat16,short,False,6.930,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float32,short,False,6.737,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float64,short,False,6.948,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.bool,short,False,6.757,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.uint8,short,False,6.402,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int8,short,False,6.550,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int16,short,False,6.518,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int32,short,False,6.766,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int64,short,False,0.929,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float16,short,False,8.557,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.bfloat16,short,False,9.045,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float32,short,False,7.672,0.000000 
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float64,short,False,7.276,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.bool,short,False,6.414,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.uint8,short,False,7.736,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int8,short,False,7.889,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int16,short,False,8.170,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int32,short,False,7.783,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int64,short,False,7.743,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float16,short,False,0.927,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.bfloat16,short,False,7.018,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float32,short,False,8.428,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float64,short,False,6.767,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.bool,short,False,6.479,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.uint8,short,False,7.827,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int8,short,False,6.450,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int16,short,False,6.320,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int32,short,False,6.385,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int64,short,False,8.119,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float16,short,False,8.063,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.bfloat16,short,False,0.925,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float32,short,False,8.629,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float64,short,False,6.638,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.bool,short,False,6.425,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.uint8,short,False,7.803,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int8,short,False,6.502,0.000000 
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int16,short,False,6.429,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int32,short,False,6.549,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int64,short,False,7.749,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float16,short,False,7.301,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.bfloat16,short,False,7.682,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float32,short,False,0.930,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float64,short,False,6.738,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.bool,short,False,6.798,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.uint8,short,False,6.506,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int8,short,False,6.494,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int16,short,False,6.668,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int32,short,False,6.696,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int64,short,False,7.115,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float16,short,False,7.910,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.bfloat16,short,False,7.410,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float32,short,False,6.868,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float64,short,False,0.924,0.000000 PyTorch,addcmul,addcmul_M1_N2_cpu_dtypetorch.float32,short,False,4.461410,0.000000 PyTorch,addcmul,addcmul_M1_N2_cpu_dtypetorch.bfloat16,short,False,4.560082,0.000000 PyTorch,addcmul,addcmul_M32_N64_cpu_dtypetorch.float32,short,False,5.141248,0.000000 diff --git a/benchmarks/operator_benchmark/pt/addmm_test.py b/benchmarks/operator_benchmark/pt/addmm_test.py index a98628944b3e8..3e94a9cd7f3dc 100644 --- a/benchmarks/operator_benchmark/pt/addmm_test.py +++ b/benchmarks/operator_benchmark/pt/addmm_test.py @@ -53,10 +53,8 @@ def forward(self, input_one, mat1, mat2): return torch.addmm(input_one, mat1, mat2) -op_bench.generate_pt_test(addmm_long_configs + addmm_long_configs, AddmmBenchmark) -op_bench.generate_pt_gradient_test( - addmm_long_configs + addmm_long_configs, AddmmBenchmark -) +op_bench.generate_pt_test(addmm_short_configs + addmm_long_configs, AddmmBenchmark) +op_bench.generate_pt_gradient_test(addmm_long_configs, AddmmBenchmark) """Mircobenchmark for addbmm operator.""" @@ -107,9 +105,7 @@ def forward(self, input_one, batch1, batch2): ) 
op_bench.generate_pt_test(addbmm_long_configs + addbmm_short_configs, AddbmmBenchmark) -op_bench.generate_pt_gradient_test( - addbmm_long_configs + addbmm_short_configs, AddbmmBenchmark -) +op_bench.generate_pt_gradient_test(addbmm_long_configs, AddbmmBenchmark) if __name__ == "__main__": op_bench.benchmark_runner.main() diff --git a/benchmarks/operator_benchmark/pt/tensor_to_test.py b/benchmarks/operator_benchmark/pt/tensor_to_test.py index 621e58212cba2..9354c8c52eaa8 100644 --- a/benchmarks/operator_benchmark/pt/tensor_to_test.py +++ b/benchmarks/operator_benchmark/pt/tensor_to_test.py @@ -4,74 +4,84 @@ tensor_conversion_short_configs = op_bench.cross_product_configs( - M=( - 8, - 16, - 32, - ), - N=( - 16, - 64, - 128, - ), + M=[32], + N=[128], device=["cpu", "cuda"], + dtype_one=[ + torch.bool, + torch.uint8, + torch.int8, + torch.int16, + torch.int32, + torch.int64, + torch.half, + torch.bfloat16, + torch.float, + torch.double, + ], + dtype_two=[ + torch.bool, + torch.uint8, + torch.int8, + torch.int16, + torch.int32, + torch.int64, + torch.half, + torch.bfloat16, + torch.float, + torch.double, + ], tags=["short"], ) tensor_conversion_long_configs = op_bench.cross_product_configs( - M=( - 64, - 128, - 256, - 512, - ), - N=( - 256, - 512, - 1024, - 2048, - ), + M=[1024], + N=[1024], device=["cpu", "cuda"], + dtype_one=[ + torch.bool, + torch.uint8, + torch.int8, + torch.int16, + torch.int32, + torch.int64, + torch.half, + torch.bfloat16, + torch.float, + torch.double, + ], + dtype_two=[ + torch.bool, + torch.uint8, + torch.int8, + torch.int16, + torch.int32, + torch.int64, + torch.half, + torch.bfloat16, + torch.float, + torch.double, + ], tags=["long"], ) -class FloatToHalfTensorConversionBenchmark(op_bench.TorchBenchmarkBase): - def init(self, M, N, device): +class TensorConversionBenchmark(op_bench.TorchBenchmarkBase): + def init(self, M, N, dtype_one, dtype_two, device): self.inputs = { "input": torch.rand( M, N, device=device, requires_grad=False, dtype=torch.float - ) + ).to(dtype=dtype_one) } + self.dtype_one = dtype_one + self.dtype_two = dtype_two def forward(self, input): - return input.to(torch.half) + return input.to(dtype=self.dtype_two) -class HalfToFloatTensorConversionBenchmark(op_bench.TorchBenchmarkBase): - def init(self, M, N, device): - self.inputs = { - "input": torch.rand( - M, N, device=device, requires_grad=False, dtype=torch.half - ) - } - - def forward(self, input): - return input.to(torch.float) - - -op_bench.generate_pt_test( - tensor_conversion_short_configs, FloatToHalfTensorConversionBenchmark -) -op_bench.generate_pt_test( - tensor_conversion_long_configs, FloatToHalfTensorConversionBenchmark -) -op_bench.generate_pt_test( - tensor_conversion_short_configs, HalfToFloatTensorConversionBenchmark -) -op_bench.generate_pt_test( - tensor_conversion_long_configs, HalfToFloatTensorConversionBenchmark -) +op_bench.generate_pt_test(tensor_conversion_short_configs, TensorConversionBenchmark) +op_bench.generate_pt_test(tensor_conversion_long_configs, TensorConversionBenchmark) if __name__ == "__main__": op_bench.benchmark_runner.main() diff --git a/benchmarks/operator_benchmark/x86_64_expected_ci_operator_benchmark_eager_float32_cpu.csv b/benchmarks/operator_benchmark/x86_64_expected_ci_operator_benchmark_eager_float32_cpu.csv index d7a8e65aa85af..71a5930a01a3f 100644 --- a/benchmarks/operator_benchmark/x86_64_expected_ci_operator_benchmark_eager_float32_cpu.csv +++ b/benchmarks/operator_benchmark/x86_64_expected_ci_operator_benchmark_eager_float32_cpu.csv @@ 
-349,24 +349,106 @@ PyTorch,sum,sum_R256_V512_dim0_contiguousTrue_cpu,short,FALSE,12.5841 PyTorch,sum,sum_R256_V512_dim0_contiguousFALSE_cpu,short,FALSE,20.8765 PyTorch,sum,sum_R256_V512_dim1_contiguousTrue_cpu,short,FALSE,15.4414 PyTorch,sum,sum_R256_V512_dim1_contiguousFALSE_cpu,short,FALSE,15.3287 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N16_cpu,short,FALSE,5.0499 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N64_cpu,short,FALSE,5.3229 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N128_cpu,short,FALSE,5.4418 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N16_cpu,short,FALSE,5.0868 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N64_cpu,short,FALSE,5.4495 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N128_cpu,short,FALSE,5.5578 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N16_cpu,short,FALSE,5.2631 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N64_cpu,short,FALSE,5.5646 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N128_cpu,short,FALSE,5.7898 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N16_cpu,short,FALSE,5.0228 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N64_cpu,short,FALSE,5.3692 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N128_cpu,short,FALSE,5.4006 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N16_cpu,short,FALSE,5.1107 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N64_cpu,short,FALSE,5.4119 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N128_cpu,short,FALSE,5.5583 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N16_cpu,short,FALSE,5.3818 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N64_cpu,short,FALSE,5.5742 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N128_cpu,short,FALSE,6.8414 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.bool,short,False,0.797 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.uint8,short,False,6.071 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int8,short,False,6.031 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int16,short,False,6.243 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int32,short,False,7.231 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int64,short,False,7.791 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float16,short,False,12.661 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.bfloat16,short,False,11.225 
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float32,short,False,9.772 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float64,short,False,9.872 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.bool,short,False,6.033 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,0.781 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int8,short,False,6.060 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int16,short,False,6.180 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,7.258 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int64,short,False,7.758 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float16,short,False,10.504 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.bfloat16,short,False,6.749 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float32,short,False,7.679 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float64,short,False,7.797 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.bool,short,False,6.019 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.uint8,short,False,6.079 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int8,short,False,0.785 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int16,short,False,6.188 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int32,short,False,7.288 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int64,short,False,7.770 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float16,short,False,10.466 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.bfloat16,short,False,6.676 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float32,short,False,7.736 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float64,short,False,7.780 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.bool,short,False,6.130 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.uint8,short,False,6.221 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int8,short,False,6.101 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int16,short,False,0.791 
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int32,short,False,6.254 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int64,short,False,7.733 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float16,short,False,10.562 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.bfloat16,short,False,6.704 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float32,short,False,7.819 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float64,short,False,8.276 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.bool,short,False,6.361 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,6.364 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int8,short,False,6.309 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int16,short,False,6.362 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,0.791 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int64,short,False,7.746 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float16,short,False,9.462 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.bfloat16,short,False,6.678 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float32,short,False,7.827 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float64,short,False,8.200 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.bool,short,False,6.925 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.uint8,short,False,6.947 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int8,short,False,6.962 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int16,short,False,6.906 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int32,short,False,7.664 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int64,short,False,0.782 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float16,short,False,10.528 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.bfloat16,short,False,10.123 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float32,short,False,9.234 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float64,short,False,8.694 
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.bool,short,False,12.653 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.uint8,short,False,9.348 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int8,short,False,8.774 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int16,short,False,9.063 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int32,short,False,10.012 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int64,short,False,13.641 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float16,short,False,0.788 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.bfloat16,short,False,13.757 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float32,short,False,7.170 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float64,short,False,12.511 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.bool,short,False,6.516 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.uint8,short,False,8.539 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int8,short,False,6.483 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int16,short,False,6.468 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int32,short,False,7.752 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int64,short,False,9.868 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float16,short,False,10.556 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.bfloat16,short,False,0.792 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float32,short,False,7.577 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float64,short,False,8.267 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.bool,short,False,6.819 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.uint8,short,False,7.715 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int8,short,False,6.754 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int16,short,False,6.825 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int32,short,False,7.790 
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int64,short,False,9.219 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float16,short,False,5.977 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.bfloat16,short,False,7.069 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float32,short,False,0.794 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float64,short,False,8.301 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.bool,short,False,7.401 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.uint8,short,False,7.843 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int8,short,False,7.117 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int16,short,False,7.170 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int32,short,False,8.000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int64,short,False,9.284 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float16,short,False,7.179 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.bfloat16,short,False,7.645 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float32,short,False,7.988 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float64,short,False,0.792 PyTorch,relu,"relu_dims(3,4,5)_contigFALSE_inplaceFALSE_dtypetorch.quint8",short,FALSE,9.4657 PyTorch,relu,"relu_dims(3,4,5)_contigFALSE_inplaceFALSE_dtypetorch.qint8",short,FALSE,9.4625 PyTorch,relu,"relu_dims(3,4,5)_contigFALSE_inplaceFALSE_dtypetorch.qint32",short,FALSE,9.4165 diff --git a/benchmarks/sparse/spmm.py b/benchmarks/sparse/spmm.py index b707556dd7a15..b2c658d6faeb6 100644 --- a/benchmarks/sparse/spmm.py +++ b/benchmarks/sparse/spmm.py @@ -52,19 +52,18 @@ def test_sparse_coo_and_csr(m, n, k, nnz, test_count): start.record() coo.matmul(mat) stop.record() - times.append(start.elapsed_time(stop)) - coo_mean_time = sum(times) / len(times) + coo_mean_time = sum(times) / len(times) - times = [] - for _ in range(test_count): - start.record() - csr.matmul(mat) - stop.record() - times.append(start.elapsed_time(stop)) + times = [] + for _ in range(test_count): + start.record() + csr.matmul(mat) + stop.record() + times.append(start.elapsed_time(stop)) - csr_mean_time = sum(times) / len(times) + csr_mean_time = sum(times) / len(times) return coo_mean_time, csr_mean_time @@ -84,10 +83,13 @@ def test_sparse_coo_and_csr(m, n, k, nnz, test_count): if args.outfile == "stdout": outfile = sys.stdout + need_close = False elif args.outfile == "stderr": outfile = sys.stderr + need_close = False else: outfile = open(args.outfile, "a") + need_close = True test_count = args.test_count m = args.m @@ -148,3 +150,5 @@ def test_sparse_coo_and_csr(m, n, k, nnz, test_count): time, 
file=outfile, ) + if need_close: + outfile.close() diff --git a/benchmarks/sparse/spmv.py b/benchmarks/sparse/spmv.py index f8900882ca4ec..3e9502686a884 100644 --- a/benchmarks/sparse/spmv.py +++ b/benchmarks/sparse/spmv.py @@ -82,10 +82,13 @@ def test_sparse_coo_and_csr(m, nnz, test_count): if args.outfile == "stdout": outfile = sys.stdout + need_close = False elif args.outfile == "stderr": outfile = sys.stderr + need_close = False else: outfile = open(args.outfile, "a") + need_close = True test_count = args.test_count m = args.m @@ -132,3 +135,5 @@ def test_sparse_coo_and_csr(m, nnz, test_count): time_csr, file=outfile, ) + if need_close: + outfile.close() diff --git a/benchmarks/sparse/triton_ops.py b/benchmarks/sparse/triton_ops.py index 48a88d592ea2c..a49a53bcd207c 100644 --- a/benchmarks/sparse/triton_ops.py +++ b/benchmarks/sparse/triton_ops.py @@ -179,10 +179,13 @@ def integer_or_float_list(a): if args.outfile == "stdout": outfile = sys.stdout + need_close = False elif args.outfile == "stderr": outfile = sys.stderr + need_close = False else: outfile = open(args.outfile, "a") + need_close = True ops = args.ops.split(",") @@ -434,3 +437,5 @@ def show_best_messages(best_messages=best_messages): if op not in {"bsr_scatter_mm6", "bsr_dense_mm_with_meta"}: # Break on operations that do not consume parameters break + if need_close: + outfile.close() diff --git a/benchmarks/transformer/score_mod.py b/benchmarks/transformer/score_mod.py index 928cbf27df5b1..e9af132df28a9 100644 --- a/benchmarks/transformer/score_mod.py +++ b/benchmarks/transformer/score_mod.py @@ -125,6 +125,17 @@ def wrapper(config, *args, **kwargs): ] DtypeString = Literal["bfloat16", "float16", "float32"] SpeedupType = Literal["fwd", "bwd"] +# Operator Name mapping +backend_to_operator_name = { + "math": "math attention kernel", + "efficient": "efficient attention kernel", + "cudnn": "cudnn attention kernel", + "fav2": "flash attention 2 kernel", + "fav3": "flash attention 3 kernel", + "fakv": "flash attention kv cache kernel", + "og-eager": "eager attention kernel", + "flex": "flex attention kernel", +} def benchmark_torch_function_in_microseconds(func: Callable, *args, **kwargs) -> float: @@ -1265,12 +1276,14 @@ class BenchmarkRecord: model: ModelInfo metric: MetricInfo + operator_name = backend_to_operator_name.get(backend, backend) + # Benchmark extra info benchmark_extra_info = { "input_config": input_config, "device": device, "arch": device_arch, - "operator_name": backend, + "operator_name": operator_name, "attn_type": config.attn_type, "shape": str(config.shape), "max_autotune": config.max_autotune, @@ -1288,7 +1301,7 @@ class BenchmarkRecord: type="attention-benchmark", origins=["pytorch"], extra_info={ - "operator_name": backend, + "operator_name": operator_name, "attn_type": config.attn_type, }, ), @@ -1315,7 +1328,7 @@ class BenchmarkRecord: type="attention-benchmark", origins=["pytorch"], extra_info={ - "operator_name": backend, + "operator_name": operator_name, }, ), metric=MetricInfo( @@ -1341,7 +1354,7 @@ class BenchmarkRecord: type="attention-benchmark", origins=["pytorch"], extra_info={ - "operator_name": backend, + "operator_name": operator_name, }, ), metric=MetricInfo( @@ -1371,7 +1384,7 @@ class BenchmarkRecord: type="attention-benchmark", origins=["pytorch"], extra_info={ - "operator_name": backend, + "operator_name": operator_name, }, ), metric=MetricInfo( diff --git a/buckbuild.bzl b/buckbuild.bzl index 4c1affd10e1bc..1d26485baca89 100644 --- a/buckbuild.bzl +++ b/buckbuild.bzl @@ -2,13 +2,14 @@ 
# These load paths point to different files in internal and OSS environment
 load("@bazel_skylib//lib:paths.bzl", "paths")
+load("//tools/build_defs:cell_defs.bzl", "get_fbsource_cell")
 load("//tools/build_defs:fb_native_wrapper.bzl", "fb_native")
 load("//tools/build_defs:fb_xplat_cxx_library.bzl", "fb_xplat_cxx_library")
 load("//tools/build_defs:fb_xplat_genrule.bzl", "fb_xplat_genrule")
 load("//tools/build_defs/windows:windows_flag_map.bzl", "windows_convert_gcc_clang_flags")
 load("//tools/build_defs:fbsource_utils.bzl", "is_arvr_mode")
 load("//tools/build_defs:glob_defs.bzl", "subdir_glob")
-load("//tools/build_defs:platform_defs.bzl", "APPLETVOS", "IOS", "MACOSX")
+load("//tools/build_defs:platform_defs.bzl", "IOS", "MACOSX")
 load("//tools/build_defs:type_defs.bzl", "is_list", "is_string")
 load("//tools/build_defs/android:build_mode_defs.bzl", is_production_build_android = "is_production_build")
 load("//tools/build_defs/apple:build_mode_defs.bzl", is_production_build_ios = "is_production_build", is_profile_build_ios = "is_profile_build")
@@ -590,6 +591,9 @@ def pt_operator_query_codegen(
         pt_allow_forced_schema_registration = True,
         compatible_with = [],
         apple_sdks = None):
+    if get_fbsource_cell() == "fbcode":
+        return
+
     oplist_dir_name = name + "_pt_oplist"

     # @lint-ignore BUCKLINT
@@ -865,6 +869,9 @@ def define_buck_targets(
         pt_xplat_cxx_library = fb_xplat_cxx_library,
         c2_fbandroid_xplat_compiler_flags = [],
         labels = []):
+    if get_fbsource_cell() == "fbcode":
+        return
+
     # @lint-ignore BUCKLINT
     fb_native.filegroup(
         name = "metal_build_srcs",
@@ -1090,7 +1097,7 @@ def define_buck_targets(
         srcs = [
             "caffe2/core/common.cc",
         ],
-        apple_sdks = (IOS, MACOSX, APPLETVOS),
+        apple_sdks = (IOS, MACOSX),
         compiler_flags = get_pt_compiler_flags(),
         labels = labels,
         # @lint-ignore BUCKLINT link_whole
diff --git a/build_variables.bzl b/build_variables.bzl
index 70121e19d8099..258e739300c1e 100644
--- a/build_variables.bzl
+++ b/build_variables.bzl
@@ -1025,6 +1025,7 @@ libtorch_python_core_sources = [
 libtorch_python_distributed_core_sources = [
     "torch/csrc/distributed/c10d/init.cpp",
     "torch/csrc/distributed/c10d/python_comm_hook.cpp",
+    "torch/csrc/distributed/c10d/python_callback_work.cpp",
 ]

 libtorch_python_distributed_sources = libtorch_python_distributed_core_sources + [
diff --git a/c10/core/Allocator.h b/c10/core/Allocator.h
index 747b73da01352..7d2c814fe84f7 100644
--- a/c10/core/Allocator.h
+++ b/c10/core/Allocator.h
@@ -19,6 +19,17 @@
 namespace c10 {

+using CaptureId_t = unsigned long long;
+// first is set if the instance is created by CUDAGraph::capture_begin.
+// second is set if the instance is created by at::cuda::graph_pool_handle.
+using MempoolId_t = std::pair<CaptureId_t, CaptureId_t>;
+
+struct MempoolIdHash {
+  std::size_t operator()(const MempoolId_t& mempool_id) const noexcept {
+    return mempool_id.first != 0 ? mempool_id.first : mempool_id.second;
+  }
+};
+
 // A DataPtr is a unique pointer (with an attached deleter and some
 // context for the deleter) to some memory, which also records what
 // device is for its data.
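Aside: the MempoolIdHash functor moved into c10/core/Allocator.h above is the hash used for unordered containers keyed by MempoolId_t (the CUDA caching allocator previously defined it locally for its per-pool maps, see the removal further down). A minimal standalone sketch of that usage, assuming an illustrative map name and mapped value that are not taken from this patch:

#include <cstddef>
#include <unordered_map>
#include <utility>

// Stand-ins mirroring the typedefs added in c10/core/Allocator.h.
using CaptureId_t = unsigned long long;
using MempoolId_t = std::pair<CaptureId_t, CaptureId_t>;

struct MempoolIdHash {
  std::size_t operator()(const MempoolId_t& mempool_id) const noexcept {
    // Whichever of the two ids is set serves as the hash value.
    return mempool_id.first != 0 ? mempool_id.first : mempool_id.second;
  }
};

int main() {
  // Hypothetical per-pool bookkeeping keyed by MempoolId_t.
  std::unordered_map<MempoolId_t, int, MempoolIdHash> pool_use_count;
  pool_use_count[{1, 0}] += 1;  // pool created by CUDAGraph::capture_begin
  pool_use_count[{0, 1}] += 1;  // pool created by graph_pool_handle
  return pool_use_count.size() == 2 ? 0 : 1;
}

std::pair has no std::hash specialization, which is why a dedicated functor is needed whenever MempoolId_t is used as an unordered-container key.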
diff --git a/c10/core/AutogradState.h b/c10/core/AutogradState.h index ad168b8c05987..d2b9cc080413d 100644 --- a/c10/core/AutogradState.h +++ b/c10/core/AutogradState.h @@ -1,6 +1,8 @@ #pragma once +#include #include +#include namespace c10 { @@ -15,7 +17,8 @@ struct C10_API AutogradState { bool inference_mode, bool fw_grad_mode, bool multithreading_enabled) - : grad_mode_(grad_mode), + : graph_exec_group_(std::nullopt), + grad_mode_(grad_mode), inference_mode_(inference_mode), fw_grad_mode_(fw_grad_mode), multithreading_enabled_(multithreading_enabled), @@ -41,6 +44,10 @@ struct C10_API AutogradState { view_replay_enabled_ = view_replay_enabled; } + void set_graph_exec_group(std::optional group) { + graph_exec_group_ = std::move(group); + } + bool get_grad_mode() const { return grad_mode_; } @@ -61,7 +68,12 @@ struct C10_API AutogradState { return view_replay_enabled_; } + const std::optional& get_graph_exec_group() const { + return graph_exec_group_; + } + private: + std::optional graph_exec_group_; bool grad_mode_ : 1; bool inference_mode_ : 1; bool fw_grad_mode_ : 1; diff --git a/c10/core/CachingDeviceAllocator.h b/c10/core/CachingDeviceAllocator.h index 0bec03ae417fa..c95d0714ce3bd 100644 --- a/c10/core/CachingDeviceAllocator.h +++ b/c10/core/CachingDeviceAllocator.h @@ -96,6 +96,13 @@ struct C10_API DeviceAllocator : public c10::Allocator { // Resets peak memory usage statistics for the specified device virtual void resetPeakStats(c10::DeviceIndex device) = 0; + + // Return the free memory size and total memory size in bytes for the + // specified device. + virtual std::pair getMemoryInfo(c10::DeviceIndex device) { + TORCH_CHECK_NOT_IMPLEMENTED( + false, "getMemoryInfo is not implemented for this allocator yet."); + } }; // This function is used to get the DeviceAllocator for a specific device type diff --git a/c10/core/DispatchKeySet.cpp b/c10/core/DispatchKeySet.cpp index 72e72f49a5e40..d1ec51b6a47d6 100644 --- a/c10/core/DispatchKeySet.cpp +++ b/c10/core/DispatchKeySet.cpp @@ -59,6 +59,9 @@ constexpr DispatchKeySet nested_dispatch_keyset = {DispatchKey::AutogradNestedTensor, DispatchKey::NestedTensor}) | DispatchKeySet(DispatchKeySet::RAW, full_backend_mask); +constexpr DispatchKeySet functorch_batched_dispatch_keyset = + DispatchKeySet(DispatchKey::FuncTorchBatched); + DispatchKeySet getRuntimeDispatchKeySet(DispatchKey t) { TORCH_INTERNAL_ASSERT(t != DispatchKey::Undefined); switch (t) { @@ -77,6 +80,8 @@ DispatchKeySet getRuntimeDispatchKeySet(DispatchKey t) { return backend_dispatch_keyset; case DispatchKey::CompositeExplicitAutogradNonFunctional: return non_functional_backend_dispatch_keyset; + case DispatchKey::FuncTorchBatchedDecomposition: + return functorch_batched_dispatch_keyset; default: return DispatchKeySet(t); } @@ -171,7 +176,7 @@ std::ostream& operator<<(std::ostream& os, DispatchKeySet ts) { os << k; first = false; } - os << ")"; + os << ')'; return os; } diff --git a/c10/core/SafePyObject.h b/c10/core/SafePyObject.h index 1ec0cdb6751e9..bcace0ac358b4 100644 --- a/c10/core/SafePyObject.h +++ b/c10/core/SafePyObject.h @@ -44,7 +44,7 @@ struct C10_API SafePyObject { (*other.pyinterpreter_)->incref(other.data_); } if (data_ != nullptr) { - (*pyinterpreter_)->decref(data_, /*has_pyobj_slot*/ false); + (*pyinterpreter_)->decref(data_); } data_ = other.data_; pyinterpreter_ = other.pyinterpreter_; @@ -53,7 +53,7 @@ struct C10_API SafePyObject { ~SafePyObject() { if (data_ != nullptr) { - (*pyinterpreter_)->decref(data_, /*has_pyobj_slot*/ false); + 
(*pyinterpreter_)->decref(data_); } } diff --git a/c10/core/ScalarType.h b/c10/core/ScalarType.h index ba1068e72695c..040c6abb7d8e2 100644 --- a/c10/core/ScalarType.h +++ b/c10/core/ScalarType.h @@ -27,26 +27,13 @@ #include C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum") +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default") namespace c10 { // See [dtype Macros note] in torch/headeronly/core/ScalarType.h // regarding macros. -template -struct CppTypeToScalarType; - -#define SPECIALIZE_CppTypeToScalarType(cpp_type, scalar_type) \ - template <> \ - struct CppTypeToScalarType \ - : std:: \ - integral_constant { \ - }; - -AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SPECIALIZE_CppTypeToScalarType) - -#undef SPECIALIZE_CppTypeToScalarType - #define DEFINE_CONSTANT(_, name) \ constexpr ScalarType k##name = ScalarType::name; @@ -105,13 +92,6 @@ inline bool isComplexType(ScalarType t) { t == ScalarType::ComplexDouble); } -inline bool isQIntType(ScalarType t) { - // Don't forget to extend this when adding new QInt types - return t == ScalarType::QInt8 || t == ScalarType::QUInt8 || - t == ScalarType::QInt32 || t == ScalarType::QUInt4x2 || - t == ScalarType::QUInt2x4; -} - inline bool isBitsType(ScalarType t) { return t == ScalarType::Bits1x8 || t == ScalarType::Bits2x4 || t == ScalarType::Bits4x2 || t == ScalarType::Bits8 || @@ -205,6 +185,12 @@ inline bool isSignedType(ScalarType t) { break; // Do not add default here, but rather define behavior of every new entry // here. `-Wswitch-enum` would raise a warning in those cases. + // TODO: get PyTorch to adopt exhaustive switches by default with a way to + // opt specific switches to being non-exhaustive. + // Exhaustive: + // `-Wswitch-enum`, `-Wswitch-default`, `-Wno-covered-switch-default` + // Non-Exhaustive: + // `-Wno-switch-enum`, `-Wswitch-default`, `-Wcovered-switch-default` } TORCH_CHECK(false, "Unknown ScalarType ", t); #undef CASE_ISSIGNED diff --git a/c10/core/StorageImpl.cpp b/c10/core/StorageImpl.cpp index a614fc9234c94..00fc03bbd0fcf 100644 --- a/c10/core/StorageImpl.cpp +++ b/c10/core/StorageImpl.cpp @@ -48,6 +48,30 @@ void warnDeprecatedDataPtr() { TORCH_CHECK(false, "Cannot access data pointer of Storage that is invalid."); } +void StorageImpl::incref_pyobject() const { + // Because intrusive_ptr incref uses relaxed memory order, we need to + // do an acquire fence to ensure that the kHasPyObject bit was + // observed before the load of the PyObject* below. + // NB: This is a no-op on x86/x86-64 + std::atomic_thread_fence(std::memory_order_acquire); + + PyObject* obj = pyobj_slot_.load_pyobj(); + (*pyobj_slot_.pyobj_interpreter())->incref(obj); +} + +void StorageImpl::decref_pyobject() const { + PyObject* obj = pyobj_slot_.load_pyobj(); + (*pyobj_slot_.pyobj_interpreter())->decref(obj); +} + +bool StorageImpl::try_incref_pyobject() const { + c10::impl::PyInterpreter* interp = pyobj_slot_.pyobj_interpreter(); + if (C10_UNLIKELY(!interp)) { + return false; + } + return (*interp)->try_incref(pyobj_slot_); +} + void SetStorageImplCreate(DeviceType t, StorageImplCreateHelper fptr) { // Allowlist verification. 
// Only if the devicetype is in the allowlist, diff --git a/c10/core/StorageImpl.h b/c10/core/StorageImpl.h index f34a1baed7a48..c7dbd5c1f005b 100644 --- a/c10/core/StorageImpl.h +++ b/c10/core/StorageImpl.h @@ -105,6 +105,12 @@ struct C10_API StorageImpl : public c10::intrusive_ptr_target { data_ptr_.clear(); } + void incref_pyobject() const override final; + + void decref_pyobject() const override final; + + bool try_incref_pyobject() const override final; + size_t nbytes() const { // OK to do this instead of maybe_as_int as nbytes is guaranteed positive TORCH_CHECK(!size_bytes_is_heap_allocated_); @@ -370,4 +376,18 @@ C10_API c10::intrusive_ptr make_storage_impl( bool resizable, std::optional device_opt); +namespace detail { + +#ifndef C10_MOBILE +template +struct TargetTraits< + T, + std::enable_if_t< + std::is_base_of_v>>> { + static constexpr bool can_have_pyobject = true; +}; +#endif + +} // namespace detail + } // namespace c10 diff --git a/c10/core/SymBool.cpp b/c10/core/SymBool.cpp index d804eb9d27409..48c407b8b069c 100644 --- a/c10/core/SymBool.cpp +++ b/c10/core/SymBool.cpp @@ -1,4 +1,5 @@ #include +#include #include namespace c10 { @@ -111,4 +112,17 @@ bool SymBool::has_hint() const { return toSymNodeImpl()->has_hint(); } +SymInt SymBool::toSymInt() const { + // If concrete bool, return concrete SymInt + if (auto ma = maybe_as_bool()) { + return SymInt(*ma ? 1 : 0); + } + + // Symbolic case: use sym_ite to convert bool to int (0 or 1) + auto node = toSymNodeImpl(); + auto one_node = node->wrap_int(1); + auto zero_node = node->wrap_int(0); + return SymInt(node->sym_ite(one_node, zero_node)); +} + } // namespace c10 diff --git a/c10/core/SymBool.h b/c10/core/SymBool.h index d5d509e239b1d..a27a28a5bf8a3 100644 --- a/c10/core/SymBool.h +++ b/c10/core/SymBool.h @@ -12,6 +12,8 @@ namespace c10 { +class SymInt; + class C10_API SymBool { public: /*implicit*/ SymBool(bool b) : data_(b) {} @@ -80,6 +82,10 @@ class C10_API SymBool { return toSymNodeImplUnowned()->constant_bool(); } + // Convert SymBool to SymInt (0 or 1) + // This is the C++ equivalent of Python's cast_symbool_to_symint_guardless + SymInt toSymInt() const; + bool is_heap_allocated() const { return ptr_; } diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index c59524a0932c2..94a7375cc32fb 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -277,7 +277,6 @@ void TensorImpl::release_resources() { if (storage_) { storage_ = {}; } - pyobj_slot_.maybe_destroy_pyobj(); } #ifndef C10_DISABLE_TENSORIMPL_EXTENSIBILITY @@ -989,6 +988,30 @@ void TensorImpl::empty_tensor_restride_symint(MemoryFormat memory_format) { } } +void TensorImpl::incref_pyobject() const { + // Because intrusive_ptr incref uses relaxed memory order, we need to + // do an acquire fence to ensure that the kHasPyObject bit was + // observed before the load of the PyObject* below. 
+ // NB: This is a no-op on x86/x86-64 + std::atomic_thread_fence(std::memory_order_acquire); + + PyObject* obj = pyobj_slot_.load_pyobj(); + (*pyobj_slot_.pyobj_interpreter())->incref(obj); +} + +void TensorImpl::decref_pyobject() const { + PyObject* obj = pyobj_slot_.load_pyobj(); + (*pyobj_slot_.pyobj_interpreter())->decref(obj); +} + +bool TensorImpl::try_incref_pyobject() const { + c10::impl::PyInterpreter* interp = pyobj_slot_.pyobj_interpreter(); + if (C10_UNLIKELY(!interp)) { + return false; + } + return (*interp)->try_incref(pyobj_slot_); +} + namespace impl { namespace { diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 66893b86c8469..71a0195dde773 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -57,6 +57,8 @@ C10_DECLARE_bool(caffe2_keep_on_shrink); // respect caffe2_keep_on_shrink. C10_DECLARE_int64(caffe2_max_keep_on_shrink_memory); +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default") + namespace at { class Tensor; class TensorBase; @@ -2176,6 +2178,12 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { return &pyobj_slot_; } + void incref_pyobject() const override final; + + void decref_pyobject() const override final; + + bool try_incref_pyobject() const override final; + private: // See NOTE [std::optional operator usage in CUDA] // We probably don't want to expose this publicly until @@ -3077,6 +3085,19 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { friend class C10_TensorImpl_Size_Check_Dummy_Class; }; +namespace detail { + +#ifndef C10_MOBILE +template +struct TargetTraits< + T, + std::enable_if_t>>> { + static constexpr bool can_have_pyobject = true; +}; +#endif + +} // namespace detail + // Note [TensorImpl size constraints] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Changed the size of TensorImpl? 
If the size went down, good for @@ -3303,3 +3324,5 @@ static_assert( #undef C10_GCC_VERSION_MINOR } // namespace c10 + +C10_DIAGNOSTIC_POP() diff --git a/c10/core/TensorOptions.cpp b/c10/core/TensorOptions.cpp index d3282ae7114e5..b1a90cce30edc 100644 --- a/c10/core/TensorOptions.cpp +++ b/c10/core/TensorOptions.cpp @@ -33,7 +33,7 @@ std::ostream& operator<<(std::ostream& stream, const TensorOptions& options) { } else { stream << "(nullopt)"; } - stream << ")"; + stream << ')'; return stream; } diff --git a/c10/core/impl/PyInterpreter.cpp b/c10/core/impl/PyInterpreter.cpp index 8676f0aaf8e0e..52d263fad36c5 100644 --- a/c10/core/impl/PyInterpreter.cpp +++ b/c10/core/impl/PyInterpreter.cpp @@ -11,8 +11,11 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable { void incref(PyObject* pyobj) const override {} // do nothing - void decref(PyObject* pyobj, bool has_pyobj_slot) const override { - } // do nothing + void decref(PyObject* pyobj) const override {} // do nothing + + bool try_incref(const c10::impl::PyObjectSlot& pyobj_slot) const override { + return false; + } #define PANIC(m) \ TORCH_INTERNAL_ASSERT( \ @@ -20,6 +23,10 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable { "attempted to call " #m \ " on a Tensor with nontrivial PyObject after corresponding interpreter died") + size_t refcnt(PyObject* pyobj) const override { + PANIC(refcnt); + } + c10::intrusive_ptr detach(const TensorImpl* self) const override { PANIC(detach); } diff --git a/c10/core/impl/PyInterpreter.h b/c10/core/impl/PyInterpreter.h index def708c24b802..463b1e520b36e 100644 --- a/c10/core/impl/PyInterpreter.h +++ b/c10/core/impl/PyInterpreter.h @@ -18,6 +18,9 @@ namespace c10 { struct IValue; class OperatorHandle; struct TensorImpl; +namespace impl { +struct PyObjectSlot; +} // namespace impl } // namespace c10 namespace torch::jit { @@ -126,9 +129,12 @@ struct C10_API PyInterpreterVTable { // Run Py_INCREF on a PyObject. virtual void incref(PyObject* pyobj) const = 0; - // Run Py_DECREF on a PyObject. We DO NOT assume the GIL is held on call - // See NOTE [PyInterpreter::decref takes a `has_pyobj_slot` arg] - virtual void decref(PyObject* pyobj, bool has_pyobj_slot) const = 0; + // Run Py_DECREF on a PyObject. We DO NOT assume the GIL is held on call. + virtual void decref(PyObject* pyobj) const = 0; + // Run PyUnstable_TryIncRef on a PyObject if it's not NULL. + virtual bool try_incref(const c10::impl::PyObjectSlot& pyobj_slot) const = 0; + // Run Py_REFCNT on a PyObject. 
+ virtual size_t refcnt(PyObject* pyobj) const = 0; // Perform a detach by deferring to the __torch_dispatch__ implementation of // detach, which will also arrange for the PyObject to get copied in this diff --git a/c10/core/impl/PyObjectSlot.cpp b/c10/core/impl/PyObjectSlot.cpp deleted file mode 100644 index 0f1bfb2110747..0000000000000 --- a/c10/core/impl/PyObjectSlot.cpp +++ /dev/null @@ -1,56 +0,0 @@ -#include - -namespace c10::impl { - -PyObjectSlot::PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {} - -PyObjectSlot::~PyObjectSlot() { - maybe_destroy_pyobj(); -} - -void PyObjectSlot::maybe_destroy_pyobj() { - if (owns_pyobj()) { - TORCH_INTERNAL_ASSERT(pyobj_interpreter_ != nullptr); - TORCH_INTERNAL_ASSERT(pyobj_ != nullptr); - (*pyobj_interpreter_.load(std::memory_order_acquire)) - ->decref(_unchecked_untagged_pyobj(), /*has_pyobj_slot*/ true); - // NB: this destructor can only be entered when there are no - // references to this C++ object (obviously), NOR any references - // to the PyObject (if there are references to the PyObject, - // then the PyObject holds an owning reference to the tensor). - // So it is OK to clear pyobj_ here as it is impossible for it to - // be used again (modulo weak reference races) - pyobj_ = nullptr; // for safety - } -} - -PyInterpreter* PyObjectSlot::pyobj_interpreter() { - return pyobj_interpreter_.load(std::memory_order_acquire); -} - -PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const { - // NOLINTNEXTLINE(performance-no-int-to-ptr) - return reinterpret_cast( - reinterpret_cast(pyobj_) & ~0x1ULL); -} - -PyInterpreter& PyObjectSlot::load_pyobj_interpreter() const { - auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire); - if (interpreter) { - return *interpreter; - } - TORCH_CHECK(false, "cannot access PyObject for Tensor - no interpreter set"); -} - -bool PyObjectSlot::owns_pyobj() { - // NOLINTNEXTLINE(performance-no-int-to-ptr) - return reinterpret_cast(pyobj_) & 1; -} - -void PyObjectSlot::set_owns_pyobj(bool b) { - // NOLINTNEXTLINE(performance-no-int-to-ptr) - pyobj_ = reinterpret_cast( - reinterpret_cast(_unchecked_untagged_pyobj()) | b); -} - -} // namespace c10::impl diff --git a/c10/core/impl/PyObjectSlot.h b/c10/core/impl/PyObjectSlot.h index 58b2490eba001..a0633401b3634 100644 --- a/c10/core/impl/PyObjectSlot.h +++ b/c10/core/impl/PyObjectSlot.h @@ -8,117 +8,58 @@ #include +namespace torch::utils { +class PyObjectPreservation; +} + namespace c10::impl { struct C10_API PyObjectSlot { public: - PyObjectSlot(); - - ~PyObjectSlot(); - - void maybe_destroy_pyobj(); - - // Associate the TensorImpl with the specified PyObject, and, if necessary, - // also tag the interpreter. - // - // NB: This lives in a header so that we can inline away the switch on status - // - // NB: THIS FUNCTION CAN RAISE AN EXCEPTION. Make sure to clean up after - // PyObject if necessary! - void init_pyobj(PyObject* pyobj) { - pyobj_interpreter_.store( - getGlobalPyInterpreter(), std::memory_order_relaxed); - pyobj_ = pyobj; - } + PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {} // Query the PyObject interpreter. This may return null if there is no - // interpreter. This is racy! - PyInterpreter* pyobj_interpreter(); - - PyObject* _unchecked_untagged_pyobj() const; - - // Test the interpreter tag. If tagged for the current interpreter, return - // a non-nullopt (but possibly null) PyObject. If (possibly) untagged, - // returns a nullopt. If it is definitely invalid, raises an error. 
- // - // If `ignore_hermetic_tls` is false and this function is called from a - // hermetic context (ie, `HermeticPyObjectTLS::get_state()` is true), then - // nullopt is returned. If `ignore_hermetic_tls` is true, then the hermetic - // context is ignored, allowing you to check the interpreter tag of a - // nonhermetic PyObject from within a hermetic context. This is necessary - // because there are some cases where the deallocator function of a - // nonhermetic PyObject is called from within a hermetic context, so it must - // be properly treated as a nonhermetic PyObject. - // - // NB: this lives in header so that we can avoid actually creating the - // std::optional + // interpreter. + PyInterpreter* pyobj_interpreter() const { + return pyobj_interpreter_.load(std::memory_order_acquire); + } - // @todo alban: I'm not too sure what's going on here, we can probably delete - // it but it's worthwhile making sure - std::optional check_pyobj(bool ignore_hermetic_tls = false) const { - impl::PyInterpreter* interpreter = - pyobj_interpreter_.load(std::memory_order_acquire); - if (interpreter == nullptr) { - return std::nullopt; - } + PyInterpreter& load_pyobj_interpreter() const { + auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire); + TORCH_INTERNAL_ASSERT( + interpreter, "cannot access PyObject for Tensor - no interpreter set"); + return *interpreter; + } - if (!ignore_hermetic_tls && c10::impl::HermeticPyObjectTLS::get_state()) { - return std::nullopt; - } else { - return _unchecked_untagged_pyobj(); - } + PyObject* load_pyobj() const { + return pyobj_.load(std::memory_order_acquire); } - PyInterpreter& load_pyobj_interpreter() const; + void store_pyobj(PyObject* obj) { + pyobj_.store(obj, std::memory_order_release); + } - bool owns_pyobj(); + bool has_unique_reference() const { + PyObject* pyobj = load_pyobj(); + return pyobj != nullptr && load_pyobj_interpreter()->refcnt(pyobj) == 1; + } - void set_owns_pyobj(bool b); + void clear() { + pyobj_.store(nullptr, std::memory_order_relaxed); + pyobj_interpreter_.store(nullptr, std::memory_order_relaxed); + } private: - // This field contains the interpreter tag for this object. See - // Note [Python interpreter tag] for general context - // - // Note [Memory ordering on Python interpreter tag] - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // What memory_order do we need when accessing this atomic? We don't - // need a single total modification order (as provided by - // memory_order_seq_cst) as pyobj_interpreter_ is monotonic: it can only - // transition from -1 to some positive integer and never changes afterwards. - // Because there is only one modification, it trivially already has a total - // modification order (e.g., we don't need fences or locked instructions on - // x86) - // - // In fact, one could make a reasonable argument that relaxed reads are OK, - // due to the presence of external locking (GIL) to ensure that interactions - // with other data structures are still correctly synchronized, so that - // we fall in the "Single-Location Data Structures" case as described in - // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf - // However, on x86, it doesn't matter if I use acquire or relaxed on the load - // as I get the same assembly in both cases. So I just use the more - // conservative acquire (which will impede compiler optimizations but I don't - // care) + // This is now always the global interpreter if the PyObject is set. + // Maybe we can remove this field some day... 
std::atomic pyobj_interpreter_; - // This field contains a reference to a PyObject representing this Tensor. - // If pyobj is nullptr, when we transfer Tensor to Python, we allocate a new - // PyObject for it and set this field. This field does not have to be - // protected by an atomic as it is only allowed to be accessed when you hold - // the GIL, or during destruction of the tensor. - // - // When a PyObject dies, you are obligated to clear this field - // (otherwise, you will try to use-after-free the pyobj); this currently - // occurs in THPVariable_clear in torch/csrc/autograd/python_variable.cpp - // - // NB: Ordinarily, this should not be a strong reference, as if the - // PyObject owns the Tensor, this would create a reference cycle. - // However, sometimes this ownership flips. To track who owns - // who, this has a single pointer tag indicating whether or not the - // C++ object owns the PyObject (the common case, zero, means PyObject - // owns the C++ object); see _unchecked_untagged_pyobj for raw access - // or check_pyobj for checked access. See references to PyObject - // resurrection in torch/csrc/autograd/python_variable.cpp - PyObject* pyobj_; + // The PyObject representing this Tensor or nullptr. Ownership is managed + // by intrusive_ptr. By the time the PyObjectSlot is destroyed, this + // reference is already dead. + std::atomic pyobj_; + + friend class torch::utils::PyObjectPreservation; }; } // namespace c10::impl diff --git a/c10/cuda/CUDAAllocatorConfig.cpp b/c10/cuda/CUDAAllocatorConfig.cpp index 3046259b48a3e..5414d838cd8c4 100644 --- a/c10/cuda/CUDAAllocatorConfig.cpp +++ b/c10/cuda/CUDAAllocatorConfig.cpp @@ -106,6 +106,9 @@ void CUDAAllocatorConfig::parseArgs(const std::string& env) { } else if (key == "graph_capture_record_stream_reuse") { i = parseGraphCaptureRecordStreamReuse(tokenizer, i); used_native_specific_option = true; + } else if (key == "per_process_memory_fraction") { + i = parsePerProcessMemoryFraction(tokenizer, i); + used_native_specific_option = true; } else { const auto& keys = c10::CachingAllocator::AcceleratorAllocatorConfig::getKeys(); @@ -146,6 +149,18 @@ size_t CUDAAllocatorConfig::parseGraphCaptureRecordStreamReuse( return i; } +double CUDAAllocatorConfig::parsePerProcessMemoryFraction( + const c10::CachingAllocator::ConfigTokenizer& tokenizer, + size_t i) { + tokenizer.checkToken(++i, ":"); + double val_env = tokenizer.toDouble(++i); + TORCH_CHECK_VALUE( + val_env >= 0.0 && val_env <= 1.0, + "per_process_memory_fraction is invalid, set it in [0.0, 1.0]"); + m_per_process_memory_fraction = val_env; + return i; +} + size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads( const c10::CachingAllocator::ConfigTokenizer& tokenizer, size_t i) { diff --git a/c10/cuda/CUDAAllocatorConfig.h b/c10/cuda/CUDAAllocatorConfig.h index d61f69467a2dc..4e6097a406bc2 100644 --- a/c10/cuda/CUDAAllocatorConfig.h +++ b/c10/cuda/CUDAAllocatorConfig.h @@ -61,6 +61,10 @@ class C10_CUDA_API CUDAAllocatorConfig { return instance().m_graph_capture_record_stream_reuse; } + static double per_process_memory_fraction() { + return instance().m_per_process_memory_fraction; + } + /** Pinned memory allocator settings */ static bool pinned_use_cuda_host_register() { return instance().m_pinned_use_cuda_host_register; @@ -152,7 +156,8 @@ class C10_CUDA_API CUDAAllocatorConfig { "pinned_use_hip_host_register", "graph_capture_record_stream_reuse", "pinned_reserve_segment_size_mb", - "pinned_num_register_threads"}; + "pinned_num_register_threads", + 
"per_process_memory_fraction"}; return keys; } @@ -177,6 +182,9 @@ class C10_CUDA_API CUDAAllocatorConfig { size_t parseGraphCaptureRecordStreamReuse( const c10::CachingAllocator::ConfigTokenizer& tokenizer, size_t i); + double parsePerProcessMemoryFraction( + const c10::CachingAllocator::ConfigTokenizer& tokenizer, + size_t i); std::atomic m_pinned_num_register_threads{1}; std::atomic m_pinned_reserve_segment_size_mb{0}; @@ -189,6 +197,7 @@ class C10_CUDA_API CUDAAllocatorConfig { std::atomic m_release_lock_on_cudamalloc{false}; std::atomic m_pinned_use_cuda_host_register{false}; std::atomic m_graph_capture_record_stream_reuse{false}; + std::atomic m_per_process_memory_fraction{1.0}; }; // Keep this for backwards compatibility diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index 091e580f95819..9e7823a394302 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -1012,12 +1012,6 @@ PrivatePoolState::PrivatePoolState( } } -struct MempoolIdHash { - std::size_t operator()(const MempoolId_t& mempool_id) const noexcept { - return mempool_id.first != 0 ? mempool_id.first : mempool_id.second; - } -}; - cudaError_t allocPrimitive(void** ptr, size_t size, AllocParams& p) { if (p.pool->owner_PrivatePool && p.pool->owner_PrivatePool->allocator()) { *ptr = p.pool->owner_PrivatePool->allocator()->raw_alloc(size); @@ -1100,7 +1094,7 @@ class RingBuffer { } // anonymous namespace } // namespace Native -static std::string reportProcessMemoryInfo(c10::DeviceIndex device) { +static std::string reportProcessMemoryInfo(const cudaDeviceProp& prop) { #ifdef PYTORCH_C10_DRIVER_API_SUPPORTED void* nvml_handle = DriverAPI::get_nvml_handle(); if (!nvml_handle) { @@ -1111,9 +1105,6 @@ static std::string reportProcessMemoryInfo(c10::DeviceIndex device) { return true; }(); - cudaDeviceProp prop{}; - C10_CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); - // NOLINTNEXTLINE(*-c-arrays) char pci_id[80]; snprintf( @@ -1215,14 +1206,16 @@ class DeviceCachingAllocator { // record used memory. size_t total_allocated_memory = 0; - size_t allowed_memory_maximum = 0; + cudaDeviceProp device_prop; + + // maximum amount of memory that device is allowed to + // allocate. This is set iff memory fraction is less than 1 + std::optional allowed_memory_maximum{std::nullopt}; // all live expandable segments std::vector expandable_segments_; std::vector devices_with_peer_access_; - bool set_fraction = false; - bool record_history = false; std::atomic context_recorder_; @@ -1264,6 +1257,9 @@ class DeviceCachingAllocator { : device_id(id), large_blocks(/*small=*/false), small_blocks(/*small=*/true) { + C10_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, id)); + + setMemoryFraction(CUDAAllocatorConfig::per_process_memory_fraction()); stats.max_split_size = static_cast(AcceleratorAllocatorConfig::max_split_size()); context_recorder_.store(nullptr); @@ -1399,7 +1395,7 @@ class DeviceCachingAllocator { if (!block_found) { // Do garbage collection if the flag is set. 
if (C10_UNLIKELY( - set_fraction && + allowed_memory_maximum.has_value() && AcceleratorAllocatorConfig::garbage_collection_threshold() > 0.0)) { garbage_collect_cached_blocks(context); @@ -1456,11 +1452,12 @@ class DeviceCachingAllocator { C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total)); std::string allowed_info; - if (set_fraction) { - allowed_info = format_size(allowed_memory_maximum) + " allowed; "; + if (allowed_memory_maximum.has_value()) { + allowed_info = + format_size(allowed_memory_maximum.value()) + " allowed; "; } - std::string proc_info = reportProcessMemoryInfo(device_id); + std::string proc_info = reportProcessMemoryInfo(device_prop); record_trace( TraceEntry::OOM, @@ -1518,7 +1515,7 @@ class DeviceCachingAllocator { for (const auto& obs : observers_local) { obs(device_id, alloc_size, - set_fraction ? allowed_memory_maximum : device_total, + allowed_memory_maximum.value_or(device_total), device_free); } @@ -2015,25 +2012,26 @@ class DeviceCachingAllocator { /** get memory fraction limiting maximum allocated memory **/ double getMemoryFraction() { - if (!set_fraction) { + if (!allowed_memory_maximum.has_value()) { return 1.0; } - size_t device_free = 0; - size_t device_total = 0; - C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total)); - return static_cast(allowed_memory_maximum) / - static_cast(device_total); + return static_cast(allowed_memory_maximum.value()) / + static_cast(device_prop.totalGlobalMem); } /** set memory fraction to limit maximum allocated memory **/ void setMemoryFraction(double fraction) { - size_t device_free = 0; - size_t device_total = 0; - C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total)); - allowed_memory_maximum = - static_cast(fraction * static_cast(device_total)); - set_fraction = true; + TORCH_CHECK( + 0 <= fraction && fraction <= 1, + "invalid fraction:", + fraction, + ". Please set within [0, 1]."); + allowed_memory_maximum = std::nullopt; + if (fraction < 1.0) { + allowed_memory_maximum = static_cast( + fraction * static_cast(device_prop.totalGlobalMem)); + } } /** get expandable segment size for all the streams on device **/ @@ -3010,7 +3008,7 @@ class DeviceCachingAllocator { BlockPool& pool = *p.pool; if (C10_UNLIKELY( - set_fraction && + allowed_memory_maximum.has_value() && AcceleratorAllocatorConfig::garbage_collection_threshold() > 0.0)) { // Track block reuse interval only when garbage collection is enabled. 
++pool.get_free_blocks_call_count; @@ -3083,7 +3081,7 @@ class DeviceCachingAllocator { size_t gc_threshold = static_cast( AcceleratorAllocatorConfig::garbage_collection_threshold() * - static_cast(allowed_memory_maximum)); + static_cast(allowed_memory_maximum.value())); // No need to trigger GC yet if (total_allocated_memory <= gc_threshold) { return; @@ -3161,8 +3159,8 @@ class DeviceCachingAllocator { bool active_pool = p.pool->owner_PrivatePool && p.pool->owner_PrivatePool->allocator(); - if (set_fraction && - total_allocated_memory + size > allowed_memory_maximum) { + if (allowed_memory_maximum.has_value() && + total_allocated_memory + size > allowed_memory_maximum.value()) { p.err = cudaErrorMemoryAllocation; return false; // Temporarily disable checkpointing & cudagraphs internally @@ -3859,7 +3857,6 @@ class NativeCachingAllocator : public CUDAAllocator { "Allocator not initialized for device ", device, ": did you call init?"); - C10_CUDA_CHECK(c10::cuda::SetDevice(device)); return device_allocator[device]->getMemoryFraction(); } @@ -3869,12 +3866,6 @@ class NativeCachingAllocator : public CUDAAllocator { "Allocator not initialized for device ", device, ": did you call init?"); - TORCH_CHECK( - 0 <= fraction && fraction <= 1, - "invalid fraction:", - fraction, - ". Please set within [0, 1]."); - C10_CUDA_CHECK(c10::cuda::SetDevice(device)); device_allocator[device]->setMemoryFraction(fraction); } @@ -4513,66 +4504,3 @@ std::atomic allocator; static BackendStaticInitializer backend_static_initializer; } // namespace cuda::CUDACachingAllocator } // namespace c10 - -namespace c10::cuda { - -// uid_ is incremented when a user creates a MemPool, -// for example: using graph_pool_handle() or c10::cuda::MemPool(). -// -// uuid_ is incremented when CUDAGraph creates a MemPool -// as a result of a user not providing a pool. -// -// MempoolId_t of {0, 0} is used to denote when no MemPool has been -// passed to a function, either by user or CUDAGraphs. For example, -// default value of MempoolId_t for capture_begin function is {0, 0}. -// That's why uid_ and uuid_ start at 1. 
-std::atomic MemPool::uid_{1}; -std::atomic MemPool::uuid_{1}; - -MemPool::MemPool( - CUDACachingAllocator::CUDAAllocator* allocator, - bool is_user_created, - bool use_on_oom) - : allocator_(allocator), is_user_created_(is_user_created) { - if (is_user_created_) { - id_ = {0, uid_++}; - } else { - id_ = {uuid_++, 0}; - } - device_ = c10::cuda::current_device(); - CUDACachingAllocator::createOrIncrefPool(device_, id_, allocator); - if (use_on_oom) { - CUDACachingAllocator::setUseOnOOM(device_, id_); - } -} - -MemPool::~MemPool() { - TORCH_INTERNAL_ASSERT(use_count() == 1); - CUDACachingAllocator::releasePool(device_, id_); - c10::cuda::CUDACachingAllocator::emptyCache(id_); -} - -MempoolId_t MemPool::id() { - return id_; -} - -CUDACachingAllocator::CUDAAllocator* MemPool::allocator() { - return allocator_; -} - -int MemPool::use_count() { - return CUDACachingAllocator::getPoolUseCount(device_, id_); -} - -c10::DeviceIndex MemPool::device() { - return device_; -} - -MempoolId_t MemPool::graph_pool_handle(bool is_user_created) { - if (is_user_created) { - return {0, uid_++}; - } - return {uuid_++, 0}; -} - -} // namespace c10::cuda diff --git a/c10/cuda/CUDACachingAllocator.h b/c10/cuda/CUDACachingAllocator.h index fbe5dab18e0ae..e7b45072f6c20 100644 --- a/c10/cuda/CUDACachingAllocator.h +++ b/c10/cuda/CUDACachingAllocator.h @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -344,6 +345,13 @@ class CUDAAllocator : public DeviceAllocator { c10::DeviceIndex device, std::shared_ptr pps) = 0; virtual std::string name() = 0; + std::pair getMemoryInfo(c10::DeviceIndex device) override { + c10::DeviceGuard device_guard({at::kCUDA, device}); + size_t free = 0; + size_t total = 0; + C10_CUDA_CHECK(cudaMemGetInfo(&free, &total)); + return {free, total}; + } }; // Allocator object, statically initialized @@ -554,41 +562,7 @@ inline std::string getUserMetadata() { } // namespace c10::cuda::CUDACachingAllocator namespace c10::cuda { - // Keep BC only using c10::CaptureId_t; using c10::MempoolId_t; - -// MemPool represents a pool of memory in a caching allocator. Currently, -// it's just the ID of the pool object maintained in the CUDACachingAllocator. -// -// An allocator pointer can be passed to the MemPool to define how the -// allocations should be done in the pool. For example: using a different -// system allocator such as ncclMemAlloc. 
-struct C10_CUDA_API MemPool { - MemPool( - CUDACachingAllocator::CUDAAllocator* allocator = nullptr, - bool is_user_created = true, - bool use_on_oom = false); - MemPool(const MemPool&) = delete; - MemPool(MemPool&&) = default; - MemPool& operator=(const MemPool&) = delete; - MemPool& operator=(MemPool&&) = default; - ~MemPool(); - - MempoolId_t id(); - CUDACachingAllocator::CUDAAllocator* allocator(); - int use_count(); - c10::DeviceIndex device(); - static MempoolId_t graph_pool_handle(bool is_user_created = true); - - private: - static std::atomic uid_; - static std::atomic uuid_; - CUDACachingAllocator::CUDAAllocator* allocator_; - bool is_user_created_; - MempoolId_t id_; - c10::DeviceIndex device_; -}; - } // namespace c10::cuda diff --git a/c10/cuda/CUDADeviceAssertionHost.cpp b/c10/cuda/CUDADeviceAssertionHost.cpp index d67ee4b23e692..08e657a411614 100644 --- a/c10/cuda/CUDADeviceAssertionHost.cpp +++ b/c10/cuda/CUDADeviceAssertionHost.cpp @@ -136,7 +136,7 @@ std::string c10_retrieve_device_side_assertion_info() { // Something failed, let's talk about that oss << failures_found << " CUDA device-side assertion failures were found on GPU #" - << device_num << "!" << std::endl; + << device_num << '!' << std::endl; if (assertion_data_for_device.assertion_count > C10_CUDA_DSA_ASSERTION_COUNT) { oss << "But at least " << assertion_data_for_device.assertion_count @@ -151,17 +151,17 @@ std::string c10_retrieve_device_side_assertion_info() { oss << "Assertion failure " << i << std::endl; oss << " GPU assertion failure message = " << self.assertion_msg << std::endl; - oss << " File containing assertion = " << self.filename << ":" + oss << " File containing assertion = " << self.filename << ':' << self.line_number << std::endl; oss << " Device function containing assertion = " << self.function_name << std::endl; - oss << " Thread ID that failed assertion = [" << self.thread_id[0] << "," - << self.thread_id[1] << "," << self.thread_id[2] << "]" << std::endl; - oss << " Block ID that failed assertion = [" << self.block_id[0] << "," - << self.block_id[1] << "," << self.block_id[2] << "]" << std::endl; + oss << " Thread ID that failed assertion = [" << self.thread_id[0] << ',' + << self.thread_id[1] << ',' << self.thread_id[2] << ']' << std::endl; + oss << " Block ID that failed assertion = [" << self.block_id[0] << ',' + << self.block_id[1] << ',' << self.block_id[2] << ']' << std::endl; if (launch_info.generation_number == self.caller) { oss << " File containing kernel launch = " - << launch_info.launch_filename << ":" << launch_info.launch_linenum + << launch_info.launch_filename << ':' << launch_info.launch_linenum << std::endl; oss << " Function containing kernel launch = " << launch_info.launch_function << std::endl; @@ -175,7 +175,7 @@ std::string c10_retrieve_device_side_assertion_info() { if (launch_registry.gather_launch_stacktrace) { oss << "Launch stacktracing disabled." << std::endl; } else { - oss << "\n" << launch_info.launch_stacktrace << std::endl; + oss << '\n' << launch_info.launch_stacktrace << std::endl; } } else { oss << " CPU launch site info: Unavailable, the circular queue wrapped around. Increase `CUDAKernelLaunchRegistry::max_size`." 
@@ -295,11 +295,19 @@ DeviceAssertionsData* CUDAKernelLaunchRegistry:: C10_CUDA_CHECK_WO_DSA( cudaMallocManaged(&uvm_assertions_ptr, sizeof(DeviceAssertionsData))); +#if CUDART_VERSION >= 13000 + cudaMemLocation cpuDevice; + cpuDevice.type = cudaMemLocationTypeDevice; + cpuDevice.id = cudaCpuDeviceId; +#else + const auto cpuDevice = cudaCpuDeviceId; +#endif + C10_CUDA_CHECK_WO_DSA(cudaMemAdvise( uvm_assertions_ptr, sizeof(DeviceAssertionsData), cudaMemAdviseSetPreferredLocation, - cudaCpuDeviceId)); + cpuDevice)); // GPU will establish direct mapping of data in CPU memory, no page faults // will be generated @@ -307,7 +315,7 @@ DeviceAssertionsData* CUDAKernelLaunchRegistry:: uvm_assertions_ptr, sizeof(DeviceAssertionsData), cudaMemAdviseSetAccessedBy, - cudaCpuDeviceId)); + cpuDevice)); // Initialize the memory from the CPU; otherwise, pages may have to be created // on demand. We think that UVM documentation indicates that first access may diff --git a/c10/cuda/CUDAMallocAsyncAllocator.cpp b/c10/cuda/CUDAMallocAsyncAllocator.cpp index 93bce51f1b9d0..674eb00035c50 100644 --- a/c10/cuda/CUDAMallocAsyncAllocator.cpp +++ b/c10/cuda/CUDAMallocAsyncAllocator.cpp @@ -427,7 +427,6 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator { // on the current device each later call sees. void init(int dev_count) override { static bool called = [](int dev_count) { - ; // Are there external guarantees init will be called before // any of the allocator's other functions? // std::lock_guard lk(general_mutex); diff --git a/c10/cuda/driver_api.h b/c10/cuda/driver_api.h index 380e7939ff76c..1ff0c9a12ac78 100644 --- a/c10/cuda/driver_api.h +++ b/c10/cuda/driver_api.h @@ -20,6 +20,22 @@ } \ } while (0) +#define C10_CUDA_DRIVER_CHECK_GOTO(EXPR, NEXT) \ + do { \ + CUresult __err = EXPR; \ + if (__err != CUDA_SUCCESS) { \ + const char* err_str; \ + CUresult get_error_str_err [[maybe_unused]] = \ + c10::cuda::DriverAPI::get()->cuGetErrorString_(__err, &err_str); \ + if (get_error_str_err != CUDA_SUCCESS) { \ + TORCH_WARN("CUDA driver error: unknown error"); \ + } else { \ + TORCH_WARN("CUDA driver error: ", err_str); \ + } \ + goto NEXT; \ + } \ + } while (0) + // The integer in the second column specifies the requested CUDA Driver API // version. 
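The guarded `cudaMemLocation` block above exists because CUDA 13 changed `cudaMemAdvise` to take a `cudaMemLocation` rather than a bare device id, and the new `C10_CUDA_DRIVER_CHECK_GOTO` macro in `driver_api.h` similarly softens failures by warning and jumping to a cleanup label instead of erroring out. A version-portable sketch of the advise pattern, mirroring the call shape and the `cudaMemLocationTypeDevice` + `cudaCpuDeviceId` choice used in the hunk above (hypothetical helper, error propagation left to the caller):

    #include <cuda_runtime.h>

    // Sketch: mark a managed allocation as preferring to live in CPU memory.
    cudaError_t advise_prefer_cpu(void* ptr, size_t bytes) {
    #if CUDART_VERSION >= 13000
      cudaMemLocation loc{};
      loc.type = cudaMemLocationTypeDevice;  // same choice as the diff above
      loc.id = cudaCpuDeviceId;
      return cudaMemAdvise(ptr, bytes, cudaMemAdviseSetPreferredLocation, loc);
    #else
      return cudaMemAdvise(ptr, bytes, cudaMemAdviseSetPreferredLocation,
                           cudaCpuDeviceId);
    #endif
    }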
The dynamic loader will accept a driver with a newer version, but it // ensures that the requested symbol exists in *at least* the specified version diff --git a/c10/metal/error.h b/c10/metal/error.h new file mode 100644 index 0000000000000..bed113769747a --- /dev/null +++ b/c10/metal/error.h @@ -0,0 +1,111 @@ +#pragma once +#include + +namespace c10 { +namespace metal { +C10_METAL_CONSTEXPR unsigned error_message_count = 30; +struct ErrorMessage { + char file[128]; + char func[128]; + char message[250]; + unsigned int line; +}; + +struct ErrorMessages { +#ifdef __METAL__ + ::metal::atomic count; +#else + unsigned int count; +#endif + ErrorMessage msg[error_message_count]; +}; + +#ifdef __METAL__ +namespace detail { +static uint strncpy(device char* dst, constant const char* src, unsigned len) { + uint i = 0; + while (src[i] != 0 && i < len - 1) { + dst[i] = src[i]; + i++; + } + dst[i] = 0; + return i; +} + +inline uint print_arg( + device char* ptr, + unsigned len, + constant const char* arg) { + return strncpy(ptr, arg, len); +} + +// Returns number length as string in base10 +static inline uint base10_length(long num) { + uint rc = 1; + if (num < 0) { + num = -num; + rc += 1; + } + while (num > 9) { + num /= 10; + rc++; + } + return rc; +} + +// Converts signed integer to string +inline uint print_arg(device char* ptr, unsigned len, long arg) { + const auto arg_len = base10_length(arg); + if (arg_len >= len) + return 0; + if (arg < 0) { + ptr[0] = '-'; + arg = -arg; + } + uint idx = 1; + do { + ptr[arg_len - idx] = '0' + (arg % 10); + arg /= 10; + idx++; + } while (arg > 0); + ptr[arg_len] = 0; + return arg_len; +} + +template +inline void print_args(device char* ptr, unsigned len, T arg) { + print_arg(ptr, len, arg); +} + +template +inline void print_args(device char* ptr, unsigned len, T arg, Args... args) { + const auto rc = print_arg(ptr, len, arg); + print_args(ptr + rc, len - rc, args...); +} + +} // namespace detail + +template +static void report_error( + device ErrorMessages* msgs, + constant const char* file, + int line, + constant const char* func, + Args... args) { + const auto idx = + atomic_fetch_add_explicit(&msgs->count, 1, ::metal::memory_order_relaxed); + if (idx >= error_message_count) { + return; + } + device auto* msg = &msgs->msg[idx]; + detail::strncpy(msg->file, file, 128); + detail::strncpy(msg->func, func, 128); + detail::print_args(msg->message, 250, args...); + msg->line = line; +} + +#define TORCH_REPORT_ERROR(buf, ...) 
\ + ::c10::metal::report_error(buf, __FILE__, __LINE__, __func__, __VA_ARGS__) +#endif +} // namespace metal +} // namespace c10 diff --git a/c10/test/build.bzl b/c10/test/build.bzl index deb917dd8fcf3..7b4028ab4afed 100644 --- a/c10/test/build.bzl +++ b/c10/test/build.bzl @@ -66,6 +66,15 @@ def define_targets(rules): ], ) + rules.cc_test( + name = "util/nofatal_test", + srcs = ["util/nofatal_test.cpp"], + deps = [ + "//c10/util:base", + "@com_google_googletest//:gtest_main", + ], + ) + rules.cc_test( name = "util/ssize_test", srcs = ["util/ssize_test.cpp"], diff --git a/c10/test/core/DispatchKeySet_test.cpp b/c10/test/core/DispatchKeySet_test.cpp index a93461a041c39..cdbdc150167e0 100644 --- a/c10/test/core/DispatchKeySet_test.cpp +++ b/c10/test/core/DispatchKeySet_test.cpp @@ -435,7 +435,7 @@ TEST(DispatchKeySet, TestFunctionalityDispatchKeyToString) { if (i > 0) { ASSERT_TRUE(res.find("Unknown") == std::string::npos) << i << " (before is " << toString(static_cast(i - 1)) - << ")"; + << ')'; } else { ASSERT_TRUE(res.find("Unknown") == std::string::npos) << i; } diff --git a/c10/test/util/Half_test.cpp b/c10/test/util/Half_test.cpp index a76814615101b..33c77ead61fc8 100644 --- a/c10/test/util/Half_test.cpp +++ b/c10/test/util/Half_test.cpp @@ -96,10 +96,10 @@ TEST(HalfConversionTest, TestPorableConversion) { for (auto x : inputs) { auto target = c10::detail::fp16_ieee_to_fp32_value(x); EXPECT_EQ(halfbits2float(x), target) - << "Test failed for uint16 to float " << x << "\n"; + << "Test failed for uint16 to float " << x << '\n'; EXPECT_EQ( float2halfbits(target), c10::detail::fp16_ieee_from_fp32_value(target)) - << "Test failed for float to uint16" << target << "\n"; + << "Test failed for float to uint16" << target << '\n'; } } diff --git a/c10/test/util/logging_test.cpp b/c10/test/util/logging_test.cpp index b8fc81ddc6bbe..4587130564dfc 100644 --- a/c10/test/util/logging_test.cpp +++ b/c10/test/util/logging_test.cpp @@ -98,7 +98,7 @@ struct Noncopyable { }; std::ostream& operator<<(std::ostream& out, const Noncopyable& nc) { - out << "Noncopyable(" << nc.x << ")"; + out << "Noncopyable(" << nc.x << ')'; return out; } } // namespace diff --git a/c10/test/util/nofatal_test.cpp b/c10/test/util/nofatal_test.cpp new file mode 100644 index 0000000000000..ba4b40b6f917e --- /dev/null +++ b/c10/test/util/nofatal_test.cpp @@ -0,0 +1,53 @@ +#include + +#include +#include + +namespace { +template +inline void expectThrowsEq(T&& fn, const char* expected_msg) { + try { + std::forward(fn)(); + } catch (const c10::Error& e) { + EXPECT_TRUE( + std::string(e.what_without_backtrace()).find(expected_msg) != + std::string::npos); + return; + } + ADD_FAILURE() << "Expected to throw exception with message \"" << expected_msg + << "\" but didn't throw"; +} +} // namespace + +TEST(NofatalTest, TorchCheckComparisons) { + // quick make sure that no-op works as expected + TORCH_CHECK_EQ(1, 1) << "i am a silly message " << 1; + expectThrowsEq( + []() { TORCH_CHECK_EQ(1, 2) << "i am a silly message " << 1; }, + "Check failed: 1 == 2 (1 vs. 2). i am a silly message 1"); + expectThrowsEq( + []() { TORCH_CHECK_NE(2, 2); }, "Check failed: 2 != 2 (2 vs. 2)."); + expectThrowsEq( + []() { TORCH_CHECK_LT(2, 2); }, "Check failed: 2 < 2 (2 vs. 2)."); + expectThrowsEq( + []() { TORCH_CHECK_LE(3, 2); }, "Check failed: 3 <= 2 (3 vs. 2)."); + expectThrowsEq( + []() { TORCH_CHECK_GT(2, 2); }, "Check failed: 2 > 2 (2 vs. 2)."); + expectThrowsEq( + []() { TORCH_CHECK_GE(2, 3); }, "Check failed: 2 >= 3 (2 vs. 
3)."); + expectThrowsEq( + []() { + void* p = nullptr; + TORCH_CHECK_NOTNULL(p); + }, + "Check failed: 'p' must be non NULL."); + +#if GTEST_HAS_DEATH_TEST +#ifndef NDEBUG + // if dbg build, DCHECK should result in deth + EXPECT_DEATH(TORCH_DCHECK_EQ(1, 2), "Check failed"); +#else + TORCH_DCHECK_EQ(1, 2); // no-op +#endif +#endif // GTEST_HAS_DEATH_TEST +} diff --git a/c10/util/ArrayRef.h b/c10/util/ArrayRef.h index 64605f5153595..55900b6ee43c6 100644 --- a/c10/util/ArrayRef.h +++ b/c10/util/ArrayRef.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -40,200 +41,105 @@ namespace c10 { /// /// This is intended to be trivially copyable, so it should be passed by /// value. +/// +/// NOTE: We have refactored out the headeronly parts of the ArrayRef struct +/// into HeaderOnlyArrayRef. As adding `virtual` would change the performance of +/// the underlying constexpr calls, we rely on apparent-type dispatch for +/// inheritance. This should be fine because their memory format is the same, +/// and it is never incorrect for ArrayRef to call HeaderOnlyArrayRef methods. +/// However, you should prefer to use ArrayRef when possible, because its use +/// of TORCH_CHECK will lead to better user-facing error messages. template -class ArrayRef final { +// ArrayRef cannot be derived from. Normally, we would use `final` +// specifier to force this constraint at compile time. However, Intel +// compiler does not recognize ArrayRef as a class template (which is +// required in the definition of at::TensorAccessor, for instance) +// when `final` specifier is used. So, we cannot define ArrayRef as +// final because of the Intel compiler issue. +class ArrayRef : public HeaderOnlyArrayRef { public: - using iterator = const T*; - using const_iterator = const T*; - using size_type = size_t; - using value_type = T; - - using reverse_iterator = std::reverse_iterator; - - private: - /// The start of the array, in an external buffer. - const T* Data; - - /// The number of elements. - size_type Length; - - void debugCheckNullptrInvariant() { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - Data != nullptr || Length == 0, - "created ArrayRef with nullptr and non-zero length! std::optional relies on this being illegal"); - } - - public: - /// @name Constructors + /// @name Constructors, all inherited from HeaderOnlyArrayRef except for + /// SmallVector. As inherited constructors won't work with class template + /// argument deduction (CTAD) until C++23, we add deduction guides after + /// the class definition to enable CTAD. /// @{ - /// Construct an empty ArrayRef. - /* implicit */ constexpr ArrayRef() : Data(nullptr), Length(0) {} - - /// Construct an ArrayRef from a single element. - // TODO Make this explicit - constexpr ArrayRef(const T& OneElt) : Data(&OneElt), Length(1) {} - - /// Construct an ArrayRef from a pointer and length. - constexpr ArrayRef(const T* data, size_t length) - : Data(data), Length(length) { - debugCheckNullptrInvariant(); - } - - /// Construct an ArrayRef from a range. - constexpr ArrayRef(const T* begin, const T* end) - : Data(begin), Length(end - begin) { - debugCheckNullptrInvariant(); - } + using HeaderOnlyArrayRef::HeaderOnlyArrayRef; /// Construct an ArrayRef from a SmallVector. This is templated in order to /// avoid instantiating SmallVectorTemplateCommon whenever we /// copy-construct an ArrayRef. + /// NOTE: this is the only constructor that is not inherited from + /// HeaderOnlyArrayRef. 
template /* implicit */ ArrayRef(const SmallVectorTemplateCommon& Vec) - : Data(Vec.data()), Length(Vec.size()) { - debugCheckNullptrInvariant(); - } - - template < - typename Container, - typename U = decltype(std::declval().data()), - typename = std::enable_if_t< - (std::is_same_v || std::is_same_v)>> - /* implicit */ ArrayRef(const Container& container) - : Data(container.data()), Length(container.size()) { - debugCheckNullptrInvariant(); - } - - /// Construct an ArrayRef from a std::vector. - // The enable_if stuff here makes sure that this isn't used for - // std::vector, because ArrayRef can't work on a std::vector - // bitfield. - template - /* implicit */ ArrayRef(const std::vector& Vec) - : Data(Vec.data()), Length(Vec.size()) { - static_assert( - !std::is_same_v, - "ArrayRef cannot be constructed from a std::vector bitfield."); - } - - /// Construct an ArrayRef from a std::array - template - /* implicit */ constexpr ArrayRef(const std::array& Arr) - : Data(Arr.data()), Length(N) {} - - /// Construct an ArrayRef from a C array. - template - // NOLINTNEXTLINE(*c-arrays*) - /* implicit */ constexpr ArrayRef(const T (&Arr)[N]) : Data(Arr), Length(N) {} - - /// Construct an ArrayRef from a std::initializer_list. - /* implicit */ constexpr ArrayRef(const std::initializer_list& Vec) - : Data( - std::begin(Vec) == std::end(Vec) ? static_cast(nullptr) - : std::begin(Vec)), - Length(Vec.size()) {} + : HeaderOnlyArrayRef(Vec.data(), Vec.size()) {} /// @} - /// @name Simple Operations + /// @name Simple Operations, mostly inherited from HeaderOnlyArrayRef /// @{ - constexpr iterator begin() const { - return Data; - } - constexpr iterator end() const { - return Data + Length; - } - - // These are actually the same as iterator, since ArrayRef only - // gives you const iterators. - constexpr const_iterator cbegin() const { - return Data; - } - constexpr const_iterator cend() const { - return Data + Length; - } - - constexpr reverse_iterator rbegin() const { - return reverse_iterator(end()); - } - constexpr reverse_iterator rend() const { - return reverse_iterator(begin()); - } - - /// Check if all elements in the array satisfy the given expression - constexpr bool allMatch(const std::function& pred) const { - return std::all_of(cbegin(), cend(), pred); - } - - /// empty - Check if the array is empty. - constexpr bool empty() const { - return Length == 0; - } - - constexpr const T* data() const { - return Data; - } - - /// size - Get the array size. - constexpr size_t size() const { - return Length; - } - /// front - Get the first element. + /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of + /// STD_TORCH_CHECK constexpr const T& front() const { TORCH_CHECK( - !empty(), "ArrayRef: attempted to access front() of empty list"); - return Data[0]; + !this->empty(), "ArrayRef: attempted to access front() of empty list"); + return this->Data[0]; } /// back - Get the last element. + /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of + /// STD_TORCH_CHECK constexpr const T& back() const { - TORCH_CHECK(!empty(), "ArrayRef: attempted to access back() of empty list"); - return Data[Length - 1]; - } - - /// equals - Check for element-wise equality. 
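Because most constructors are now pulled in with `using HeaderOnlyArrayRef::HeaderOnlyArrayRef;`, class template argument deduction no longer comes for free; that is exactly what the deduction guides added after the class (later in this hunk) restore. An illustrative sketch, not taken from the diff, of the call sites those guides are meant to keep working:

    #include <array>
    #include <vector>
    #include <c10/util/ArrayRef.h>

    void ctad_still_works() {
      std::vector<int> v{1, 2, 3};
      std::array<float, 2> a{1.f, 2.f};
      int raw[3] = {4, 5, 6};

      c10::ArrayRef r1(v);                   // deduces ArrayRef<int>   (std::vector guide)
      c10::ArrayRef r2(a);                   // deduces ArrayRef<float> (std::array guide)
      c10::ArrayRef r3(raw);                 // deduces ArrayRef<int>   (C array guide)
      c10::ArrayRef r4(v.data(), v.size());  // deduces ArrayRef<int>   (pointer + length guide)
      (void)r1; (void)r2; (void)r3; (void)r4;
    }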
- constexpr bool equals(ArrayRef RHS) const { - return Length == RHS.Length && std::equal(begin(), end(), RHS.begin()); + TORCH_CHECK( + !this->empty(), "ArrayRef: attempted to access back() of empty list"); + return this->Data[this->Length - 1]; } /// slice(n, m) - Take M elements of the array starting at element N + /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of + /// STD_TORCH_CHECK constexpr ArrayRef slice(size_t N, size_t M) const { TORCH_CHECK( - N + M <= size(), + N + M <= this->size(), "ArrayRef: invalid slice, N = ", N, "; M = ", M, "; size = ", - size()); - return ArrayRef(data() + N, M); + this->size()); + return ArrayRef(this->data() + N, M); } /// slice(n) - Chop off the first N elements of the array. + /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of + /// STD_TORCH_CHECK constexpr ArrayRef slice(size_t N) const { TORCH_CHECK( - N <= size(), "ArrayRef: invalid slice, N = ", N, "; size = ", size()); - return slice(N, size() - N); + N <= this->size(), + "ArrayRef: invalid slice, N = ", + N, + "; size = ", + this->size()); + return slice(N, this->size() - N); // should this slice be this->slice? } /// @} /// @name Operator Overloads /// @{ - constexpr const T& operator[](size_t Index) const { - return Data[Index]; - } /// Vector compatibility + /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of + /// STD_TORCH_CHECK constexpr const T& at(size_t Index) const { TORCH_CHECK( - Index < Length, + Index < this->Length, "ArrayRef: invalid index Index = ", Index, "; Length = ", - Length); - return Data[Index]; + this->Length); + return this->Data[Index]; } /// Disallow accidental assignment from a temporary. @@ -253,26 +159,58 @@ class ArrayRef final { std::enable_if_t, ArrayRef>& operator=( std::initializer_list) = delete; - /// @} - /// @name Expensive Operations - /// @{ - std::vector vec() const { - return std::vector(Data, Data + Length); - } - /// @} }; +/// Deduction guides for ArrayRef to support CTAD with inherited constructors +/// These mirror the constructors inherited from HeaderOnlyArrayRef +/// @{ + +// Single element constructor +template +ArrayRef(const T&) -> ArrayRef; + +// Pointer and length constructor +template +ArrayRef(const T*, size_t) -> ArrayRef; + +// Range constructor (begin, end) +template +ArrayRef(const T*, const T*) -> ArrayRef; + +// Generic container constructor (anything with .data() and .size()) +template +ArrayRef(const Container&) -> ArrayRef< + std::remove_pointer_t().data())>>; + +// std::vector constructor +template +ArrayRef(const std::vector&) -> ArrayRef; + +// std::array constructor +template +ArrayRef(const std::array&) -> ArrayRef; + +// C array constructor +template +ArrayRef(const T (&)[N]) -> ArrayRef; + +// std::initializer_list constructor +template +ArrayRef(const std::initializer_list&) -> ArrayRef; + +/// @} + template std::ostream& operator<<(std::ostream& out, ArrayRef list) { int i = 0; - out << "["; + out << '['; for (const auto& e : list) { if (i++ > 0) out << ", "; out << e; } - out << "]"; + out << ']'; return out; } diff --git a/c10/util/Backtrace.cpp b/c10/util/Backtrace.cpp index 8838cafb029e4..29dbfe427ae01 100644 --- a/c10/util/Backtrace.cpp +++ b/c10/util/Backtrace.cpp @@ -106,8 +106,8 @@ class GetBacktraceImpl { /*length*/ &length, /*status*/ &status); - os << " frame #" << idx++ << "\t" - << ((demangled != NULL && status == 0) ? demangled : symbol) << "[" + os << " frame #" << idx++ << '\t' + << ((demangled != NULL && status == 0) ? 
demangled : symbol) << '[' << addr << "]\t" << std::endl; } free(demangled); @@ -274,7 +274,7 @@ class GetBacktraceImpl { } else { // In the edge-case where we couldn't parse the frame string, we can // just use it directly (it may have a different format). - stream << symbols[frame_number] << "\n"; + stream << symbols[frame_number] << '\n'; } } @@ -413,8 +413,8 @@ class GetBacktraceImpl { << back_trace_[i_frame] << std::dec; if (with_symbol) { stream << std::setfill('0') << std::setw(16) << std::uppercase - << std::hex << p_symbol->Address << std::dec << " " << module - << "!" << p_symbol->Name; + << std::hex << p_symbol->Address << std::dec << ' ' << module + << '!' << p_symbol->Name; } else { stream << " " << module << "!"; } @@ -424,7 +424,7 @@ class GetBacktraceImpl { } else { stream << " @ "; } - stream << "]" << std::endl; + stream << ']' << std::endl; } return stream.str(); diff --git a/c10/util/Exception.cpp b/c10/util/Exception.cpp index 1928c2c175c7b..50f423f917981 100644 --- a/c10/util/Exception.cpp +++ b/c10/util/Exception.cpp @@ -44,7 +44,7 @@ std::string Error::compute_what(bool include_backtrace) const { if (context_.size() == 1) { // Fold error and context in one line - oss << " (" << context_[0] << ")"; + oss << " (" << context_[0] << ')'; } else { for (const auto& c : context_) { oss << "\n " << c; @@ -52,7 +52,7 @@ std::string Error::compute_what(bool include_backtrace) const { } if (include_backtrace && backtrace_) { - oss << "\n" << backtrace_->get(); + oss << '\n' << backtrace_->get(); } return oss.str(); @@ -247,7 +247,7 @@ void WarningHandler::process(const Warning& warning) { LOG_AT_FILE_LINE( WARNING, warning.source_location().file, warning.source_location().line) << "Warning: " << warning.msg() << " (function " - << warning.source_location().function << ")"; + << warning.source_location().function << ')'; } std::string GetExceptionString(const std::exception& e) { diff --git a/c10/util/Exception.h b/c10/util/Exception.h index 6b2fd626bfb5e..a4537c862ae7b 100644 --- a/c10/util/Exception.h +++ b/c10/util/Exception.h @@ -379,7 +379,11 @@ C10_API std::string GetExceptionString(const std::exception& e); // ---------------------------------------------------------------------------- #ifdef STRIP_ERROR_MESSAGES -#define TORCH_RETHROW(e, ...) throw +#define TORCH_RETHROW(e, ...) \ + do { \ + (void)e; /* Suppress unused variable warning */ \ + throw; \ + } while (false) #else #define TORCH_RETHROW(e, ...) \ do { \ @@ -702,6 +706,98 @@ namespace c10::detail { #define TORCH_CHECK_ARG(cond, argN, ...) \ TORCH_CHECK(cond, "invalid argument ", argN, ": ", __VA_ARGS__) +#ifndef FATAL_IF +#ifdef C10_USE_GLOG +#define FATAL_IF(condition) \ + condition ? (void)0 \ + : ::c10::LoggerVoidify() & \ + ::c10::MessageLogger(__FILE__, __LINE__, ::google::GLOG_FATAL) \ + .stream() +#else +#define FATAL_IF(condition) \ + condition ? (void)0 \ + : ::c10::LoggerVoidify() & \ + ::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_FATAL).stream() +#endif +#endif + +#ifndef NON_FATAL_IF +#ifdef C10_USE_GLOG +#define NON_FATAL_IF(condition) \ + condition ? (void)0 \ + : ::c10::LoggerVoidify() & \ + ::c10::MessageLogger( \ + __FILE__, __LINE__, ::google::GLOG_FATAL, false) \ + .stream() +#else +#define NON_FATAL_IF(condition) \ + condition ? 
(void)0 \ + : ::c10::LoggerVoidify() & \ + ::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_FATAL, false) \ + .stream() +#endif +#endif + +// Binary comparison check macros +#define TORCH_CHECK_OP(val1, val2, op) \ + NON_FATAL_IF(((val1)op(val2))) \ + << "Check failed: " #val1 " " #op " " #val2 " (" << (val1) << " vs. " \ + << (val2) << "). " + +#define TORCH_DCHECK_OP(val1, val2, op) \ + FATAL_IF(((val1)op(val2))) << "Check failed: " #val1 " " #op " " #val2 " (" \ + << (val1) << " vs. " << (val2) << "). " + +#define TORCH_CHECK_EQ(val1, val2) TORCH_CHECK_OP(val1, val2, ==) +#define TORCH_CHECK_NE(val1, val2) TORCH_CHECK_OP(val1, val2, !=) +#define TORCH_CHECK_LE(val1, val2) TORCH_CHECK_OP(val1, val2, <=) +#define TORCH_CHECK_LT(val1, val2) TORCH_CHECK_OP(val1, val2, <) +#define TORCH_CHECK_GE(val1, val2) TORCH_CHECK_OP(val1, val2, >=) +#define TORCH_CHECK_GT(val1, val2) TORCH_CHECK_OP(val1, val2, >) + +// Debug versions of TORCH_CHECK_OP macros +#ifndef NDEBUG +#define TORCH_DCHECK_EQ(val1, val2) TORCH_DCHECK_OP(val1, val2, ==) +#define TORCH_DCHECK_NE(val1, val2) TORCH_DCHECK_OP(val1, val2, !=) +#define TORCH_DCHECK_LE(val1, val2) TORCH_DCHECK_OP(val1, val2, <=) +#define TORCH_DCHECK_LT(val1, val2) TORCH_DCHECK_OP(val1, val2, <) +#define TORCH_DCHECK_GE(val1, val2) TORCH_DCHECK_OP(val1, val2, >=) +#define TORCH_DCHECK_GT(val1, val2) TORCH_DCHECK_OP(val1, val2, >) +#else // !NDEBUG +// Optimized versions - generate no code +#define TORCH_DCHECK_EQ(val1, val2) \ + while (false) \ + TORCH_DCHECK_OP(val1, val2, ==) +#define TORCH_DCHECK_NE(val1, val2) \ + while (false) \ + TORCH_DCHECK_OP(val1, val2, !=) +#define TORCH_DCHECK_LE(val1, val2) \ + while (false) \ + TORCH_DCHECK_OP(val1, val2, <=) +#define TORCH_DCHECK_LT(val1, val2) \ + while (false) \ + TORCH_DCHECK_OP(val1, val2, <) +#define TORCH_DCHECK_GE(val1, val2) \ + while (false) \ + TORCH_DCHECK_OP(val1, val2, >=) +#define TORCH_DCHECK_GT(val1, val2) \ + while (false) \ + TORCH_DCHECK_OP(val1, val2, >) +#endif // NDEBUG + +// Null pointer check macro +#define TORCH_CHECK_NOTNULL(val) \ + ::c10::CheckNotNull(__FILE__, __LINE__, #val, (val), false) + +#ifndef NDEBUG +#define TORCH_DCHECK_NOTNULL(val) \ + ::c10::CheckNotNull(__FILE__, __LINE__, #val, (val), true) +#else // !NDEBUG +#define TORCH_DCHECK_NOTNULL(val) \ + while (false) \ + TORCH_CHECK_NOTNULL(val) +#endif // NDEBUG + // ---------------------------------------------------------------------------- // Deprecated macros // ---------------------------------------------------------------------------- diff --git a/c10/util/Logging.cpp b/c10/util/Logging.cpp index 555ab685c0b5f..298503dfbe340 100644 --- a/c10/util/Logging.cpp +++ b/c10/util/Logging.cpp @@ -291,6 +291,32 @@ namespace c10 { using fLB::FLAGS_logtostderr; using fLI::FLAGS_minloglevel; using fLI::FLAGS_v; + +MessageLogger::MessageLogger( + const char* file, + int line, + int severity, + bool exit_on_fatal) + : stream_(), severity_(severity), exit_on_fatal_(exit_on_fatal) {} + +MessageLogger::~MessageLogger() noexcept(false) { + if (severity_ == ::google::GLOG_FATAL) { + DealWithFatal(); + } +} + +std::stringstream& MessageLogger::stream() { + return stream_; +} + +void MessageLogger::DealWithFatal() { + if (exit_on_fatal_) { + LOG(FATAL) << stream_.str(); + } else { + throw c10::Error(stream_.str(), nullptr, nullptr); + } +} + } // namespace c10 C10_DEFINE_int( @@ -412,17 +438,16 @@ void ShowLogInfoToStderr() { FLAGS_caffe2_log_level = GLOG_INFO; } -MessageLogger::MessageLogger(const char* file, int line, int 
severity) - : severity_(severity) { +MessageLogger::MessageLogger( + const char* file, + int line, + int severity, + bool exit_on_fatal) + : severity_(severity), exit_on_fatal_(exit_on_fatal) { if (severity_ < FLAGS_caffe2_log_level) { // Nothing needs to be logged. return; } -#ifdef ANDROID - tag_ = "native"; -#else // !ANDROID - tag_ = ""; -#endif // ANDROID time_t rawtime = 0; time(&rawtime); @@ -448,22 +473,22 @@ MessageLogger::MessageLogger(const char* file, int line, int severity) if (GLOBAL_RANK != -1) { stream_ << "[rank" << GLOBAL_RANK << "]:"; } - stream_ << "[" << CAFFE2_SEVERITY_PREFIX[std::min(4, GLOG_FATAL - severity_)] + stream_ << '[' << CAFFE2_SEVERITY_PREFIX[std::min(4, GLOG_FATAL - severity_)] << (timeinfo->tm_mon + 1) * 100 + timeinfo->tm_mday - << std::setfill('0') << " " << std::setw(2) << timeinfo->tm_hour - << ":" << std::setw(2) << timeinfo->tm_min << ":" << std::setw(2) - << timeinfo->tm_sec << "." << std::setw(9) << ns << " " - << c10::detail::StripBasename(std::string(file)) << ":" << line + << std::setfill('0') << ' ' << std::setw(2) << timeinfo->tm_hour + << ':' << std::setw(2) << timeinfo->tm_min << ':' << std::setw(2) + << timeinfo->tm_sec << '.' << std::setw(9) << ns << ' ' + << c10::detail::StripBasename(std::string(file)) << ':' << line << "] "; } // Output the contents of the stream to the proper channel on destruction. -MessageLogger::~MessageLogger() { +MessageLogger::~MessageLogger() noexcept(false) { if (severity_ < FLAGS_caffe2_log_level) { // Nothing needs to be logged. return; } - stream_ << "\n"; + stream_ << '\n'; #ifdef ANDROID static const int android_log_levels[] = { ANDROID_LOG_FATAL, // LOG_FATAL @@ -498,6 +523,18 @@ MessageLogger::~MessageLogger() { } } +std::stringstream& MessageLogger::stream() { + return stream_; +} + +void MessageLogger::DealWithFatal() { + if (exit_on_fatal_) { + abort(); + } else { + throw c10::Error(stream_.str(), nullptr, nullptr); + } +} + } // namespace c10 #endif // !C10_USE_GLOG diff --git a/c10/util/Metaprogramming.cpp b/c10/util/Metaprogramming.cpp deleted file mode 100644 index f6ee24a79bcd8..0000000000000 --- a/c10/util/Metaprogramming.cpp +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/c10/util/Metaprogramming.h b/c10/util/Metaprogramming.h index d504706f3283a..a5912706e1ed1 100644 --- a/c10/util/Metaprogramming.h +++ b/c10/util/Metaprogramming.h @@ -1,224 +1 @@ -#pragma once - -#include -#include - -namespace c10::guts { - -/** - * Access information about result type or arguments from a function type. - * Example: - * using A = function_traits::return_type // A == int - * using A = function_traits::parameter_types::tuple_type - * // A == tuple - */ -template -struct function_traits { - static_assert( - !std::is_same_v, - "In function_traits, Func must be a plain function type."); -}; -template -struct function_traits { - using func_type = Result(Args...); - using return_type = Result; - using parameter_types = typelist::typelist; - static constexpr auto number_of_parameters = sizeof...(Args); -}; - -/** - * infer_function_traits: creates a `function_traits` type for a simple - * function (pointer) or functor (lambda/struct). Currently does not support - * class methods. 
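The `TORCH_CHECK_EQ`/`_NE`/`_LT`/`_LE`/`_GT`/`_GE` macros introduced in `Exception.h` ride on the non-fatal `MessageLogger` path implemented above: on failure they throw `c10::Error` (so they can be caught like any `TORCH_CHECK`), while the `TORCH_DCHECK_*` forms stay fatal in debug builds and generate no code under `NDEBUG`. A small usage sketch with a hypothetical function, assuming `<c10/util/Logging.h>` pulls in the macro and `MessageLogger` definitions:

    #include <c10/util/Logging.h>
    #include <iostream>

    int checked_index(int i, int size) {
      TORCH_CHECK_GE(i, 0) << "index must be non-negative";
      TORCH_CHECK_LT(i, size) << "index out of range for size " << size;
      return i;
    }

    int main() {
      try {
        checked_index(5, 3);
      } catch (const c10::Error& e) {
        // e.g. "Check failed: i < size (5 vs. 3). index out of range for size 3"
        std::cerr << e.what_without_backtrace() << '\n';
      }
      return 0;
    }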
- */ - -template -struct infer_function_traits { - using type = function_traits< - c10::guts::detail::strip_class_t>; -}; - -template -struct infer_function_traits { - using type = function_traits; -}; - -template -struct infer_function_traits { - using type = function_traits; -}; - -template -using infer_function_traits_t = typename infer_function_traits::type; - -/** - * make_function_traits: creates a `function_traits` type given a Return type - * and a typelist of Argument types - * - * Example: - * bool f(int, int); - * - * infer_function_traits_t == make_function_traits_t> - */ -template -struct make_function_traits { - static_assert( - false_t::value, - "In guts::make_function_traits, the ArgList argument must be typelist<...>."); -}; - -template -struct make_function_traits> { - using type = function_traits; -}; - -template -using make_function_traits_t = - typename make_function_traits::type; - -/** - * make_offset_index_sequence - * Like make_index_sequence, but starting from Start instead of 0. - * - * Example: - * make_offset_index_sequence<10, 3> == std::index_sequence<10, 11, 12> - */ -template -struct make_offset_index_sequence_impl - : make_offset_index_sequence_impl { - static_assert( - static_cast(Start) >= 0, - "make_offset_index_sequence: Start < 0"); - static_assert(static_cast(N) >= 0, "make_offset_index_sequence: N < 0"); -}; - -template -struct make_offset_index_sequence_impl { - typedef std::index_sequence type; -}; - -template -using make_offset_index_sequence = - typename make_offset_index_sequence_impl::type; - -/** - * Use tuple_elements to extract a position-indexed subset of elements - * from the argument tuple into a result tuple. - * - * Example: - * std::tuple t = std::make_tuple(0, "HEY", 2.0); - * std::tuple result = tuple_elements(t, std::index_sequence<0, - * 2>()); - */ -template -constexpr auto tuple_elements(Tuple t, std::index_sequence /*unused*/) { - return std::tuple...>(std::get(t)...); -} - -/** - * Use tuple_take to extract the first or last n elements from the argument - * tuple into a result tuple. - * - * Example: - * std::tuple t = std::make_tuple(0, "HEY", 2.0); - * std::tuple first_two = tuple_take(t); - * std::tuple last_two = tuple_take(t); - */ -template -struct TupleTake {}; - -template -struct TupleTake= 0, void>> { - static auto call(Tuple t) { - constexpr size_t size = std::tuple_size(); - static_assert(N <= size, "tuple_take: N > size"); - return tuple_elements(t, std::make_index_sequence{}); - } -}; - -template - struct TupleTake < Tuple, - N, std::enable_if_t> { - static auto call(Tuple t) { - constexpr size_t size = std::tuple_size(); - static_assert(-N <= size, "tuple_take: -N > size"); - return tuple_elements(t, make_offset_index_sequence{}); - } -}; - -template -auto tuple_take(Tuple t) { - return TupleTake::call(t); -} - -/** - * Use tuple_slice to extract a contiguous subtuple from the argument. - * - * Example: - * std::tuple t = std::make_tuple(0, - * "HEY", 2.0, false); std::tuple middle_two = - * tuple_slice(t); - */ -template -constexpr auto tuple_slice(Tuple t) { - constexpr size_t size = std::tuple_size(); - static_assert(Start + N <= size, "tuple_slice: Start + N > size"); - return tuple_elements(t, make_offset_index_sequence{}); -} - -/** - * Use tuple_map to run a mapping function over a tuple to get a new tuple. 
- * - * Example 1: - * auto result = tuple_map(std::tuple(3, 4, 5), [] - * (int32_t a) -> int16_t {return a+1;}); - * // result == std::tuple(4, 5, 6) - * - * Example 2: - * struct Mapper { - * std::string operator()(int32_t a) const { - * return std::to_string(a); - * } - * int64_t operator()(const std::string& a) const { - * return atoi(a.c_str()); - * } - * }; - * auto result = tuple_map(std::tuple(3, "4"), - * Mapper()); - * // result == std::tuple("3", 4) - * - * Example 3: - * struct A final { - * int32_t func() { - * return 5; - * } - * }; - * struct B final { - * std::string func() { - * return "5"; - * } - * }; - * auto result = tuple_map(std::make_tuple(A(), B()), [] (auto a) { return - * a.func(); }); - * // result == std::tuple(5, "5"); - */ -namespace detail { -template -auto tuple_map( - // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved) - std::tuple&& tuple, - const Mapper& mapper, - std::index_sequence /*unused*/) { - return std::tuple(std::get( - tuple))))...>(mapper(std::forward(std::get(tuple)))...); -} -} // namespace detail - -template -auto tuple_map(std::tuple&& tuple, const Mapper& mapper) { - return detail::tuple_map( - std::move(tuple), mapper, std::index_sequence_for()); -} - -} // namespace c10::guts +#include diff --git a/c10/util/SmallVector.h b/c10/util/SmallVector.h index d02c9380a563d..d47f37cdf7eca 100644 --- a/c10/util/SmallVector.h +++ b/c10/util/SmallVector.h @@ -1412,13 +1412,13 @@ inline size_t capacity_in_bytes(const SmallVector& X) { template std::ostream& operator<<(std::ostream& out, const SmallVector& list) { int i = 0; - out << "["; + out << '['; for (auto e : list) { if (i++ > 0) out << ", "; out << e; } - out << "]"; + out << ']'; return out; } diff --git a/c10/util/StringUtil.cpp b/c10/util/StringUtil.cpp index 063a8fc93ea7a..6fae2f004cc93 100644 --- a/c10/util/StringUtil.cpp +++ b/c10/util/StringUtil.cpp @@ -79,7 +79,7 @@ std::ostream& _str(std::ostream& ss, const std::wstring& wString) { } // namespace detail std::ostream& operator<<(std::ostream& out, const SourceLocation& loc) { - out << loc.function << " at " << loc.file << ":" << loc.line; + out << loc.function << " at " << loc.file << ':' << loc.line; return out; } diff --git a/c10/util/StringUtil.h b/c10/util/StringUtil.h index cbc6f4ec336bb..de241bc9f7c45 100644 --- a/c10/util/StringUtil.h +++ b/c10/util/StringUtil.h @@ -170,7 +170,7 @@ inline bool isPrint(char s) { } inline void printQuotedString(std::ostream& stmt, const std::string_view str) { - stmt << "\""; + stmt << '"'; for (auto s : str) { switch (s) { case '\\': @@ -224,7 +224,7 @@ inline void printQuotedString(std::ostream& stmt, const std::string_view str) { break; } } - stmt << "\""; + stmt << '"'; } template diff --git a/c10/util/TypeList.h b/c10/util/TypeList.h index 244e5bb141cd7..9f79099710d71 100644 --- a/c10/util/TypeList.h +++ b/c10/util/TypeList.h @@ -1,515 +1 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace c10::guts { - -template -struct false_t : std::false_type {}; -template