diff --git a/.automation_scripts/run_pytorch_unit_tests.py b/.automation_scripts/run_pytorch_unit_tests.py
index fff20bf64fec..1e06c9eecf0f 100644
--- a/.automation_scripts/run_pytorch_unit_tests.py
+++ b/.automation_scripts/run_pytorch_unit_tests.py
@@ -338,7 +338,11 @@ def run_test_and_summarize_results(
     # copy current environment variables
     _environ = dict(os.environ)

+<<<<<<< HEAD
+=======
+
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
     # modify path
     test_shell_path = pytorch_root_dir + "/.ci/pytorch/test.sh"
     test_run_test_path = pytorch_root_dir + "/test/run_test.py"
@@ -385,6 +389,13 @@ def run_test_and_summarize_results(
     global CONSOLIDATED_LOG_FILE_PATH
     CONSOLIDATED_LOG_FILE_PATH = overall_logs_path_current_run + CONSOLIDATED_LOG_FILE_NAME

+<<<<<<< HEAD
+=======
+    # Check multi gpu availability if distributed tests are enabled
+    if ("distributed" in test_config) or len(distributed_list) != 0:
+        check_num_gpus_for_distributed()
+
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
     # Install test requirements
     command = "pip3 install -r requirements.txt && pip3 install -r .ci/docker/requirements-ci.txt"
     run_command_and_capture_output(command)
@@ -393,15 +404,23 @@ def run_test_and_summarize_results(
     if not priority_tests and not default_list and not distributed_list and not inductor_list:
         # run entire tests for default, distributed and inductor workflows → use test.sh
         if not test_config:
+<<<<<<< HEAD
+=======
+            check_num_gpus_for_distributed()
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
            # default test process
            res_default_all = run_entire_tests("default", test_shell_path, overall_logs_path_current_run, test_reports_src)
            res_all_tests_dict["default"] = res_default_all
            # distributed test process
+<<<<<<< HEAD
            res_distributed_all = {}
            if is_multi_gpus_available_for_distributed():
                res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src)
            else:
                print("Warning: Cannot run distributed unit tests. Number of visible GPUs should be >1 to run distributed unit tests.")
+=======
+            res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src)
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
            res_all_tests_dict["distributed"] = res_distributed_all
            # inductor test process
            res_inductor_all = run_entire_tests("inductor", test_shell_path, overall_logs_path_current_run, test_reports_src)
@@ -414,11 +433,15 @@ def run_test_and_summarize_results(
            res_default_all = run_entire_tests("default", test_shell_path, overall_logs_path_current_run, test_reports_src)
            res_all_tests_dict["default"] = res_default_all
        if "distributed" in workflow_list:
+<<<<<<< HEAD
            res_distributed_all = {}
            if is_multi_gpus_available_for_distributed():
                res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src)
            else:
                print("Warning: Cannot run distributed unit tests. 
Number of visible GPUs should be >1 to run distributed unit tests.") +======= + res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) res_all_tests_dict["distributed"] = res_distributed_all if "inductor" in workflow_list: res_inductor_all = run_entire_tests("inductor", test_shell_path, overall_logs_path_current_run, test_reports_src) @@ -426,15 +449,23 @@ def run_test_and_summarize_results( # Run priority test for each workflow elif priority_tests and not default_list and not distributed_list and not inductor_list: if not test_config: +<<<<<<< HEAD +======= + check_num_gpus_for_distributed() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # default test process res_default_priority = run_priority_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src) res_all_tests_dict["default"] = res_default_priority # distributed test process +<<<<<<< HEAD res_distributed_priority = {} if is_multi_gpus_available_for_distributed(): res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src) else: print("Warning: Cannot run distributed unit tests. Number of visible GPUs should be >1 to run distributed unit tests.") +======= + res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) res_all_tests_dict["distributed"] = res_distributed_priority # will not run inductor priority tests print("Inductor priority tests cannot run since no core tests defined with inductor workflow.") @@ -446,11 +477,15 @@ def run_test_and_summarize_results( res_default_priority = run_priority_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src) res_all_tests_dict["default"] = res_default_priority if "distributed" in workflow_list: +<<<<<<< HEAD res_distributed_priority = {} if is_multi_gpus_available_for_distributed(): res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src) else: print("Warning: Cannot run distributed unit tests. Number of visible GPUs should be >1 to run distributed unit tests.") +======= + res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) res_all_tests_dict["distributed"] = res_distributed_priority if "inductor" in workflow_list: print("Inductor priority tests cannot run since no core tests defined with inductor workflow.") @@ -466,11 +501,15 @@ def run_test_and_summarize_results( distributed_workflow_list = [] for item in distributed_list: distributed_workflow_list.append(item) +<<<<<<< HEAD res_distributed_selected = {} if is_multi_gpus_available_for_distributed(): res_distributed_selected = run_selected_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src, distributed_workflow_list) else: print("Warning: Cannot run distributed unit tests. 
Number of visible GPUs should be >1 to run distributed unit tests.") +======= + res_distributed_selected = run_selected_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src, distributed_workflow_list) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) res_all_tests_dict["distributed"] = res_distributed_selected if inductor_list: inductor_workflow_list = [] @@ -518,10 +557,17 @@ def parse_args(): "RUN SELECTED TESTS: python3 run_pytorch_unit_tests.py --default_list test_weak test_dlpack --inductor_list inductor/test_torchinductor") return parser.parse_args() +<<<<<<< HEAD def is_multi_gpus_available_for_distributed(): p = subprocess.run("rocminfo | grep -cE 'Name:\\s+gfx'", shell=True, capture_output=True, text=True) num_gpus_visible = int(p.stdout) return num_gpus_visible > 1 +======= +def check_num_gpus_for_distributed(): + p = subprocess.run("rocminfo | grep -cE 'Name:\s+gfx'", shell=True, capture_output=True, text=True) + num_gpus_visible = int(p.stdout) + assert num_gpus_visible > 1, "Number of visible GPUs should be >1 to run distributed unit tests" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) def main(): args = parse_args() diff --git a/.ci/aarch64_linux/aarch64_ci_build.sh b/.ci/aarch64_linux/aarch64_ci_build.sh index ff3337e3f6d8..5371e4eab362 100644 --- a/.ci/aarch64_linux/aarch64_ci_build.sh +++ b/.ci/aarch64_linux/aarch64_ci_build.sh @@ -3,10 +3,15 @@ set -eux -o pipefail GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-} +<<<<<<< HEAD if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then export TORCH_CUDA_ARCH_LIST="9.0" elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then export TORCH_CUDA_ARCH_LIST="9.0;10.0;12.0" +======= +if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then + export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" @@ -27,6 +32,10 @@ if [ "$DESIRED_CUDA" = "cpu" ]; then USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn else echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA" +<<<<<<< HEAD +======= + export USE_SYSTEM_NCCL=1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda fi diff --git a/.ci/aarch64_linux/aarch64_wheel_ci_build.py b/.ci/aarch64_linux/aarch64_wheel_ci_build.py index a182f4d36bd9..e0ea94032e13 100755 --- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py +++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py @@ -31,6 +31,7 @@ def build_ArmComputeLibrary() -> None: "build=native", ] acl_install_dir = "/acl" +<<<<<<< HEAD acl_checkout_dir = "ComputeLibrary" os.makedirs(acl_install_dir) check_call( @@ -52,6 +53,30 @@ def build_ArmComputeLibrary() -> None: cwd=acl_checkout_dir, ) for d in ["arm_compute", "include", "utils", "support", "src"]: +======= + acl_checkout_dir = os.getenv("ACL_SOURCE_DIR", "ComputeLibrary") + if os.path.isdir(acl_install_dir): + shutil.rmtree(acl_install_dir) + if not os.path.isdir(acl_checkout_dir) or not 
len(os.listdir(acl_checkout_dir)): + check_call( + [ + "git", + "clone", + "https://github.com/ARM-software/ComputeLibrary.git", + "-b", + "v25.02", + "--depth", + "1", + "--shallow-submodules", + ] + ) + + check_call( + ["scons", "Werror=1", f"-j{os.cpu_count()}"] + acl_build_flags, + cwd=acl_checkout_dir, + ) + for d in ["arm_compute", "include", "utils", "support", "src", "build"]: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}") @@ -87,7 +112,11 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None: "/usr/local/cuda/lib64/libcusparseLt.so.0", "/usr/local/cuda/lib64/libcusolver.so.11", "/usr/local/cuda/lib64/libcurand.so.10", +<<<<<<< HEAD "/usr/local/cuda/lib64/libnvToolsExt.so.1", +======= + "/usr/local/cuda/lib64/libnccl.so.2", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "/usr/local/cuda/lib64/libnvJitLink.so.12", "/usr/local/cuda/lib64/libnvrtc.so.12", "/usr/local/cuda/lib64/libcudnn_adv.so.9", @@ -107,9 +136,15 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None: "/usr/local/lib/libnvpl_blas_core.so.0", ] +<<<<<<< HEAD if "128" in desired_cuda: libs_to_copy += [ "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.8", +======= + if "129" in desired_cuda: + libs_to_copy += [ + "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.9", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "/usr/local/cuda/lib64/libcufile.so.0", "/usr/local/cuda/lib64/libcufile_rdma.so.1", ] @@ -203,8 +238,15 @@ def parse_arguments(): ).decode() print("Building PyTorch wheel") +<<<<<<< HEAD build_vars = "MAX_JOBS=5 CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " os.system("cd /pytorch; python setup.py clean") +======= + build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " + # MAX_JOB=5 is not required for CPU backend (see commit 465d98b) + if enable_cuda: + build_vars = "MAX_JOBS=5 " + build_vars +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION") desired_cuda = os.getenv("DESIRED_CUDA") diff --git a/.ci/aarch64_linux/build_aarch64_wheel.py b/.ci/aarch64_linux/build_aarch64_wheel.py index c6593a179cfa..02b5c4a5fcad 100755 --- a/.ci/aarch64_linux/build_aarch64_wheel.py +++ b/.ci/aarch64_linux/build_aarch64_wheel.py @@ -19,13 +19,19 @@ # AMI images for us-east-1, change the following based on your ~/.aws/config os_amis = { +<<<<<<< HEAD "ubuntu18_04": "ami-078eece1d8119409f", # login_name: ubuntu +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "ubuntu20_04": "ami-052eac90edaa9d08f", # login_name: ubuntu "ubuntu22_04": "ami-0c6c29c5125214c77", # login_name: ubuntu "redhat8": "ami-0698b90665a2ddcf1", # login_name: ec2-user } +<<<<<<< HEAD ubuntu18_04_ami = os_amis["ubuntu18_04"] +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ubuntu20_04_ami = os_amis["ubuntu20_04"] @@ -659,6 +665,7 @@ def configure_system( "sudo apt-get install -y python3-dev python3-yaml python3-setuptools python3-wheel python3-pip" ) host.run_cmd("pip3 install dataclasses typing-extensions") +<<<<<<< 
HEAD # Install and switch to gcc-8 on Ubuntu-18.04 if not host.using_docker() and host.ami == ubuntu18_04_ami and compiler == "gcc-8": host.run_cmd("sudo apt-get install -y g++-8 gfortran-8") @@ -671,6 +678,8 @@ def configure_system( host.run_cmd( "sudo update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-8 100" ) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if not use_conda: print("Installing Cython + numpy from PyPy") host.run_cmd("sudo pip3 install Cython") @@ -1026,7 +1035,11 @@ def parse_arguments(): install_condaforge_python(host, args.python_version) sys.exit(0) +<<<<<<< HEAD python_version = args.python_version if args.python_version is not None else "3.8" +======= + python_version = args.python_version if args.python_version is not None else "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if args.use_torch_from_pypi: configure_system(host, compiler=args.compiler, python_version=python_version) diff --git a/.ci/caffe2/README.md b/.ci/caffe2/README.md index c22cd8f228a3..4be1122ab164 100644 --- a/.ci/caffe2/README.md +++ b/.ci/caffe2/README.md @@ -10,5 +10,8 @@ example: `py2-cuda9.0-cudnn7-ubuntu16.04`. The Docker images that are built on Jenkins and are used in triggered builds already have this environment variable set in their manifest. Also see `./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`. +<<<<<<< HEAD Our Jenkins installation is located at https://ci.pytorch.org/jenkins/. +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/caffe2/test.sh b/.ci/caffe2/test.sh index a8adfc1fa0c7..1d6fd150e7be 100755 --- a/.ci/caffe2/test.sh +++ b/.ci/caffe2/test.sh @@ -13,10 +13,13 @@ if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then echo 'Skipping tests' exit 0 fi +<<<<<<< HEAD if [[ "${BUILD_ENVIRONMENT}" == *-rocm* ]]; then # temporary to locate some kernel issues on the CI nodes export HSAKMT_DEBUG_LEVEL=4 fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # These additional packages are needed for circleci ROCm builds. if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then # Need networkx 2.0 because bellmand_ford was moved in 2.1 . Scikit-image by diff --git a/.ci/docker/README.md b/.ci/docker/README.md index 68df30763151..d9b2d8e1b4d5 100644 --- a/.ci/docker/README.md +++ b/.ci/docker/README.md @@ -34,5 +34,9 @@ See `build.sh` for valid build environments (it's the giant switch). 
./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest # Set flags (see build.sh) and build image +<<<<<<< HEAD sudo bash -c 'PROTOBUF=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest +======= +sudo bash -c 'TRITON=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` diff --git a/.ci/docker/almalinux/Dockerfile b/.ci/docker/almalinux/Dockerfile index 5f17a6332dd1..86b50d5f1a1a 100644 --- a/.ci/docker/almalinux/Dockerfile +++ b/.ci/docker/almalinux/Dockerfile @@ -1,6 +1,13 @@ +<<<<<<< HEAD ARG CUDA_VERSION=12.4 ARG BASE_TARGET=cuda${CUDA_VERSION} FROM amd64/almalinux:8 as base +======= +ARG CUDA_VERSION=12.6 +ARG BASE_TARGET=cuda${CUDA_VERSION} +ARG ROCM_IMAGE=rocm/dev-almalinux-8:6.3-complete +FROM amd64/almalinux:8.10-20250519 as base +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ENV LC_ALL en_US.UTF-8 ENV LANG en_US.UTF-8 @@ -8,12 +15,19 @@ ENV LANGUAGE en_US.UTF-8 ARG DEVTOOLSET_VERSION=11 +<<<<<<< HEAD ENV LC_ALL en_US.UTF-8 ENV LANG en_US.UTF-8 ENV LANGUAGE en_US.UTF-8 RUN yum -y update RUN yum -y install epel-release +======= +RUN yum -y update +RUN yum -y install epel-release +# install glibc-langpack-en make sure en_US.UTF-8 locale is available +RUN yum -y install glibc-langpack-en +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-toolchain # Just add everything as a safe.directory for git since these will be used in multiple places with git RUN git config --global --add safe.directory '*' @@ -41,15 +55,25 @@ RUN bash ./install_conda.sh && rm install_conda.sh # Install CUDA FROM base as cuda +<<<<<<< HEAD ARG CUDA_VERSION=12.4 RUN rm -rf /usr/local/cuda-* ADD ./common/install_cuda.sh install_cuda.sh +======= +ARG CUDA_VERSION=12.6 +RUN rm -rf /usr/local/cuda-* +ADD ./common/install_cuda.sh install_cuda.sh +COPY ./common/install_nccl.sh install_nccl.sh +COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/ +COPY ./common/install_cusparselt.sh install_cusparselt.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ENV CUDA_HOME=/usr/local/cuda-${CUDA_VERSION} # Preserve CUDA_VERSION for the builds ENV CUDA_VERSION=${CUDA_VERSION} # Make things in our path by default ENV PATH=/usr/local/cuda-${CUDA_VERSION}/bin:$PATH +<<<<<<< HEAD FROM cuda as cuda11.8 RUN bash ./install_cuda.sh 11.8 ENV DESIRED_CUDA=11.8 @@ -62,10 +86,29 @@ FROM cuda as cuda12.4 RUN bash ./install_cuda.sh 12.4 ENV DESIRED_CUDA=12.4 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FROM cuda as cuda12.6 RUN bash ./install_cuda.sh 12.6 ENV DESIRED_CUDA=12.6 +<<<<<<< HEAD +======= +FROM cuda as cuda12.8 +RUN bash ./install_cuda.sh 12.8 +ENV DESIRED_CUDA=12.8 + +FROM cuda as cuda12.9 +RUN bash ./install_cuda.sh 12.9 +ENV DESIRED_CUDA=12.9 + +FROM ${ROCM_IMAGE} as rocm +ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" +ADD ./common/install_mkl.sh install_mkl.sh +RUN bash ./install_mkl.sh && rm install_mkl.sh +ENV MKLROOT 
/opt/intel + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install MNIST test data FROM base as mnist ADD ./common/install_mnist.sh install_mnist.sh @@ -73,9 +116,15 @@ RUN bash ./install_mnist.sh FROM base as all_cuda COPY --from=cuda11.8 /usr/local/cuda-11.8 /usr/local/cuda-11.8 +<<<<<<< HEAD COPY --from=cuda12.1 /usr/local/cuda-12.1 /usr/local/cuda-12.1 COPY --from=cuda12.4 /usr/local/cuda-12.4 /usr/local/cuda-12.4 COPY --from=cuda12.6 /usr/local/cuda-12.6 /usr/local/cuda-12.6 +======= +COPY --from=cuda12.6 /usr/local/cuda-12.6 /usr/local/cuda-12.6 +COPY --from=cuda12.8 /usr/local/cuda-12.8 /usr/local/cuda-12.8 +COPY --from=cuda12.9 /usr/local/cuda-12.9 /usr/local/cuda-12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Final step FROM ${BASE_TARGET} as final diff --git a/.ci/docker/almalinux/build.sh b/.ci/docker/almalinux/build.sh index cf81bdf4aea0..7c4f639b5b30 100755 --- a/.ci/docker/almalinux/build.sh +++ b/.ci/docker/almalinux/build.sh @@ -1,12 +1,17 @@ #!/usr/bin/env bash # Script used only in CD pipeline +<<<<<<< HEAD set -eou pipefail +======= +set -exou pipefail +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) image="$1" shift if [ -z "${image}" ]; then +<<<<<<< HEAD echo "Usage: $0 IMAGE" exit 1 fi @@ -79,4 +84,66 @@ if [[ "${WITH_PUSH:-}" == true ]]; then docker push "${DOCKER_IMAGE_SHA_TAG}" fi ) +======= + echo "Usage: $0 IMAGENAME:ARCHTAG" + exit 1 +fi + +# Go from imagename:tag to tag +DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}') + +CUDA_VERSION="" +ROCM_VERSION="" +EXTRA_BUILD_ARGS="" +if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then + # extract cuda version from image name and tag. e.g. manylinux2_28-builder:cuda12.8 returns 12.8 + CUDA_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}') + EXTRA_BUILD_ARGS="--build-arg CUDA_VERSION=${CUDA_VERSION}" +elif [[ "${DOCKER_TAG_PREFIX}" == rocm* ]]; then + # extract rocm version from image name and tag. e.g. manylinux2_28-builder:rocm6.2.4 returns 6.2.4 + ROCM_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'rocm' '{print $2}') + EXTRA_BUILD_ARGS="--build-arg ROCM_IMAGE=rocm/dev-almalinux-8:${ROCM_VERSION}-complete" +fi + +case ${DOCKER_TAG_PREFIX} in + cpu) + BASE_TARGET=base + ;; + cuda*) + BASE_TARGET=cuda${CUDA_VERSION} + ;; + rocm*) + BASE_TARGET=rocm + ;; + *) + echo "ERROR: Unknown docker tag ${DOCKER_TAG_PREFIX}" + exit 1 + ;; +esac + +# TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712 +# is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023. 
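Note: the rewritten `.ci/docker/almalinux/build.sh` above derives the CUDA or ROCm version by splitting the image tag prefix with `awk -F'cuda'` / `awk -F'rocm'`. A minimal Python sketch of that parsing, for illustration only (the function name is hypothetical and not part of the script):

    import re

    def parse_docker_tag_prefix(tag_prefix: str) -> dict:
        """Illustrative equivalent of the awk splits above, e.g. 'cuda12.8' -> {'cuda': '12.8'}."""
        for kind in ("cuda", "rocm"):
            m = re.fullmatch(kind + r"(.+)", tag_prefix)
            if m:
                return {kind: m.group(1)}
        return {}  # "cpu" and other prefixes carry no version

    print(parse_docker_tag_prefix("rocm6.2.4"))  # {'rocm': '6.2.4'}
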
+sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service +sudo systemctl daemon-reload +sudo systemctl restart docker + +export DOCKER_BUILDKIT=1 +TOPDIR=$(git rev-parse --show-toplevel) +tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]') + +docker build \ + --target final \ + --progress plain \ + --build-arg "BASE_TARGET=${BASE_TARGET}" \ + --build-arg "DEVTOOLSET_VERSION=11" \ + ${EXTRA_BUILD_ARGS} \ + -t ${tmp_tag} \ + $@ \ + -f "${TOPDIR}/.ci/docker/almalinux/Dockerfile" \ + ${TOPDIR}/.ci/docker/ + +if [ -n "${CUDA_VERSION}" ]; then + # Test that we're using the right CUDA compiler + docker run --rm "${tmp_tag}" nvcc --version | grep "cuda_${CUDA_VERSION}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index a7594b0f82b3..ac52ab2567b4 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -50,32 +50,46 @@ if [[ "$image" == *xla* ]]; then exit 0 fi +<<<<<<< HEAD if [[ "$image" == *-focal* ]]; then UBUNTU_VERSION=20.04 elif [[ "$image" == *-jammy* ]]; then +======= +if [[ "$image" == *-jammy* ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) UBUNTU_VERSION=22.04 elif [[ "$image" == *-noble* ]]; then UBUNTU_VERSION=24.04 elif [[ "$image" == *ubuntu* ]]; then extract_version_from_image_name ubuntu UBUNTU_VERSION +<<<<<<< HEAD elif [[ "$image" == *centos* ]]; then extract_version_from_image_name centos CENTOS_VERSION +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi if [ -n "${UBUNTU_VERSION}" ]; then OS="ubuntu" +<<<<<<< HEAD elif [ -n "${CENTOS_VERSION}" ]; then OS="centos" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else echo "Unable to derive operating system base..." exit 1 fi DOCKERFILE="${OS}/Dockerfile" +<<<<<<< HEAD # When using ubuntu - 22.04, start from Ubuntu docker image, instead of nvidia/cuda docker image. 
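Note: the if/elif chain that follows picks which Dockerfile to use from substrings of the image name. A rough Python rendering of that dispatch, covering only the branches visible in this hunk (illustrative, not part of `build.sh`):

    def pick_dockerfile(image: str, os_name: str, ubuntu_version: str) -> str:
        # HEAD side: CUDA images on non-22.04 Ubuntu start from the nvidia/cuda base image.
        if "cuda" in image and ubuntu_version != "22.04":
            return f"{os_name}-cuda/Dockerfile"
        if "rocm" in image:
            return f"{os_name}-rocm/Dockerfile"
        if "xpu" in image:
            return f"{os_name}-xpu/Dockerfile"
        if "linter" in image:
            return "linter/Dockerfile"
        return f"{os_name}/Dockerfile"

    print(pick_dockerfile("pytorch-linux-jammy-rocm-n-py3", "ubuntu", "22.04"))  # ubuntu-rocm/Dockerfile
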
if [[ "$image" == *cuda* && "$UBUNTU_VERSION" != "22.04" ]]; then DOCKERFILE="${OS}-cuda/Dockerfile" elif [[ "$image" == *rocm* ]]; then +======= +if [[ "$image" == *rocm* ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DOCKERFILE="${OS}-rocm/Dockerfile" elif [[ "$image" == *xpu* ]]; then DOCKERFILE="${OS}-xpu/Dockerfile" @@ -87,9 +101,12 @@ elif [[ "$image" == *linter* ]]; then DOCKERFILE="linter/Dockerfile" fi +<<<<<<< HEAD # CMake 3.18 is needed to support CUDA17 language variant CMAKE_VERSION=3.18.5 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb _UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b if [[ "$image" == *rocm* ]]; then @@ -97,6 +114,7 @@ if [[ "$image" == *rocm* ]]; then _UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d fi +<<<<<<< HEAD # It's annoying to rename jobs every time you want to rewrite a # configuration, so we hardcode everything here rather than do it # from scratch @@ -108,10 +126,24 @@ case "$image" in GCC_VERSION=11 PROTOBUF=yes DB=yes +======= +tag=$(echo $image | awk -F':' '{print $2}') + +# It's annoying to rename jobs every time you want to rewrite a +# configuration, so we hardcode everything here rather than do it +# from scratch +case "$tag" in + pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11) + CUDA_VERSION=12.8.1 + CUDNN_VERSION=9 + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VISION=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} +<<<<<<< HEAD CONDA_CMAKE=yes TRITON=yes ;; @@ -122,10 +154,20 @@ case "$image" in GCC_VERSION=9 PROTOBUF=yes DB=yes +======= + TRITON=yes + ;; + pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks) + CUDA_VERSION=12.8.1 + CUDNN_VERSION=9 + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VISION=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} +<<<<<<< HEAD CONDA_CMAKE=yes TRITON=yes INDUCTOR_BENCHMARKS=yes @@ -137,10 +179,21 @@ case "$image" in GCC_VERSION=9 PROTOBUF=yes DB=yes +======= + TRITON=yes + INDUCTOR_BENCHMARKS=yes + ;; + pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks) + CUDA_VERSION=12.8.1 + CUDNN_VERSION=9 + ANACONDA_PYTHON_VERSION=3.12 + GCC_VERSION=9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VISION=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} +<<<<<<< HEAD CONDA_CMAKE=yes TRITON=yes INDUCTOR_BENCHMARKS=yes @@ -152,25 +205,46 @@ case "$image" in GCC_VERSION=9 PROTOBUF=yes DB=yes +======= + TRITON=yes + INDUCTOR_BENCHMARKS=yes + ;; + pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks) + CUDA_VERSION=12.8.1 + CUDNN_VERSION=9 + ANACONDA_PYTHON_VERSION=3.13 + GCC_VERSION=9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VISION=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} +<<<<<<< HEAD CONDA_CMAKE=yes TRITON=yes INDUCTOR_BENCHMARKS=yes ;; pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9) +======= + TRITON=yes + INDUCTOR_BENCHMARKS=yes + ;; + 
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CUDA_VERSION=12.6.3 CUDNN_VERSION=9 ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=9 +<<<<<<< HEAD PROTOBUF=yes DB=yes +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VISION=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} +<<<<<<< HEAD CONDA_CMAKE=yes TRITON=yes ;; @@ -181,10 +255,20 @@ case "$image" in GCC_VERSION=9 PROTOBUF=yes DB=yes +======= + TRITON=yes + ;; + pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks) + CUDA_VERSION=12.6 + CUDNN_VERSION=9 + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VISION=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} +<<<<<<< HEAD CONDA_CMAKE=yes TRITON=yes INDUCTOR_BENCHMARKS=yes @@ -196,10 +280,21 @@ case "$image" in GCC_VERSION=9 PROTOBUF=yes DB=yes +======= + TRITON=yes + INDUCTOR_BENCHMARKS=yes + ;; + pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks) + CUDA_VERSION=12.6 + CUDNN_VERSION=9 + ANACONDA_PYTHON_VERSION=3.12 + GCC_VERSION=9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VISION=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} +<<<<<<< HEAD CONDA_CMAKE=yes TRITON=yes INDUCTOR_BENCHMARKS=yes @@ -211,10 +306,21 @@ case "$image" in GCC_VERSION=9 PROTOBUF=yes DB=yes +======= + TRITON=yes + INDUCTOR_BENCHMARKS=yes + ;; + pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks) + CUDA_VERSION=12.6 + CUDNN_VERSION=9 + ANACONDA_PYTHON_VERSION=3.13 + GCC_VERSION=9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VISION=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} +<<<<<<< HEAD CONDA_CMAKE=yes TRITON=yes INDUCTOR_BENCHMARKS=yes @@ -226,10 +332,21 @@ case "$image" in GCC_VERSION=9 PROTOBUF=yes DB=yes +======= + TRITON=yes + INDUCTOR_BENCHMARKS=yes + ;; + pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9) + CUDA_VERSION=12.8.1 + CUDNN_VERSION=9 + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VISION=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} +<<<<<<< HEAD CONDA_CMAKE=yes TRITON=yes ;; @@ -297,12 +414,47 @@ case "$image" in ROCM_VERSION=6.3 NINJA_VERSION=1.9.0 CONDA_CMAKE=yes +======= + TRITON=yes + ;; + pytorch-linux-jammy-py3-clang12-onnx) + ANACONDA_PYTHON_VERSION=3.9 + CLANG_VERSION=12 + VISION=yes + ONNX=yes + ;; + pytorch-linux-jammy-py3.9-clang12) + ANACONDA_PYTHON_VERSION=3.9 + CLANG_VERSION=12 + VISION=yes + TRITON=yes + ;; + pytorch-linux-jammy-py3.11-clang12) + ANACONDA_PYTHON_VERSION=3.11 + CLANG_VERSION=12 + VISION=yes + TRITON=yes + ;; + pytorch-linux-jammy-py3.9-gcc9) + ANACONDA_PYTHON_VERSION=3.9 + GCC_VERSION=9 + VISION=yes + TRITON=yes + ;; + pytorch-linux-jammy-rocm-n-1-py3) + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=11 + VISION=yes + ROCM_VERSION=6.3 + NINJA_VERSION=1.9.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TRITON=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} 
UCC_COMMIT=${_UCC_COMMIT} INDUCTOR_BENCHMARKS=yes ;; +<<<<<<< HEAD pytorch-linux-jammy-xpu-2024.0-py3) ANACONDA_PYTHON_VERSION=3.9 GCC_VERSION=11 @@ -313,30 +465,63 @@ case "$image" in NINJA_VERSION=1.9.0 CONDA_CMAKE=yes TRITON=yes +======= + pytorch-linux-jammy-rocm-n-py3) + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=11 + VISION=yes + ROCM_VERSION=6.4 + NINJA_VERSION=1.9.0 + TRITON=yes + KATEX=yes + UCX_COMMIT=${_UCX_COMMIT} + UCC_COMMIT=${_UCC_COMMIT} + INDUCTOR_BENCHMARKS=yes +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ;; pytorch-linux-jammy-xpu-2025.0-py3) ANACONDA_PYTHON_VERSION=3.9 GCC_VERSION=11 +<<<<<<< HEAD PROTOBUF=yes DB=yes VISION=yes XPU_VERSION=2025.0 NINJA_VERSION=1.9.0 CONDA_CMAKE=yes +======= + VISION=yes + XPU_VERSION=2025.0 + NINJA_VERSION=1.9.0 + TRITON=yes + ;; + pytorch-linux-jammy-xpu-2025.1-py3) + ANACONDA_PYTHON_VERSION=3.9 + GCC_VERSION=11 + VISION=yes + XPU_VERSION=2025.1 + NINJA_VERSION=1.9.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TRITON=yes ;; pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks) ANACONDA_PYTHON_VERSION=3.9 GCC_VERSION=11 +<<<<<<< HEAD PROTOBUF=yes DB=yes VISION=yes KATEX=yes CONDA_CMAKE=yes +======= + VISION=yes + KATEX=yes +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TRITON=yes DOCS=yes INDUCTOR_BENCHMARKS=yes ;; +<<<<<<< HEAD pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-clang12) ANACONDA_PYTHON_VERSION=3.9 CUDA_VERSION=11.8 @@ -344,38 +529,60 @@ case "$image" in CLANG_VERSION=12 PROTOBUF=yes DB=yes +======= + pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12) + ANACONDA_PYTHON_VERSION=3.9 + CUDA_VERSION=12.8.1 + CUDNN_VERSION=9 + CLANG_VERSION=12 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VISION=yes TRITON=yes ;; pytorch-linux-jammy-py3-clang12-asan) ANACONDA_PYTHON_VERSION=3.9 CLANG_VERSION=12 +<<<<<<< HEAD PROTOBUF=yes DB=yes VISION=yes CONDA_CMAKE=yes +======= + VISION=yes +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TRITON=yes ;; pytorch-linux-jammy-py3-clang15-asan) ANACONDA_PYTHON_VERSION=3.10 CLANG_VERSION=15 +<<<<<<< HEAD CONDA_CMAKE=yes +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VISION=yes ;; pytorch-linux-jammy-py3-clang18-asan) ANACONDA_PYTHON_VERSION=3.10 CLANG_VERSION=18 +<<<<<<< HEAD CONDA_CMAKE=yes +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VISION=yes ;; pytorch-linux-jammy-py3.9-gcc11) ANACONDA_PYTHON_VERSION=3.9 GCC_VERSION=11 +<<<<<<< HEAD PROTOBUF=yes DB=yes VISION=yes KATEX=yes CONDA_CMAKE=yes +======= + VISION=yes + KATEX=yes +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TRITON=yes DOCS=yes UNINSTALL_DILL=yes @@ -383,14 +590,20 @@ case "$image" in pytorch-linux-jammy-py3-clang12-executorch) ANACONDA_PYTHON_VERSION=3.10 CLANG_VERSION=12 +<<<<<<< HEAD CONDA_CMAKE=yes +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) EXECUTORCH=yes ;; pytorch-linux-jammy-py3.12-halide) 
CUDA_VERSION=12.6 ANACONDA_PYTHON_VERSION=3.12 GCC_VERSION=11 +<<<<<<< HEAD CONDA_CMAKE=yes +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) HALIDE=yes TRITON=yes ;; @@ -398,6 +611,7 @@ case "$image" in CUDA_VERSION=12.6 ANACONDA_PYTHON_VERSION=3.12 GCC_VERSION=11 +<<<<<<< HEAD CONDA_CMAKE=yes TRITON_CPU=yes ;; @@ -412,15 +626,34 @@ case "$image" in ANACONDA_PYTHON_VERSION=3.9 CUDA_VERSION=11.8 CONDA_CMAKE=yes +======= + TRITON_CPU=yes + ;; + pytorch-linux-jammy-linter) + # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627. + # We will need to update mypy version eventually, but that's for another day. The task + # would be to upgrade mypy to 1.0.0 with Python 3.11 + PYTHON_VERSION=3.9 + ;; + pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter) + PYTHON_VERSION=3.9 + CUDA_VERSION=12.8.1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ;; pytorch-linux-jammy-aarch64-py3.10-gcc11) ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=11 ACL=yes +<<<<<<< HEAD PROTOBUF=yes DB=yes VISION=yes CONDA_CMAKE=yes +======= + VISION=yes + CONDA_CMAKE=yes + OPENBLAS=yes +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # snadampal: skipping llvm src build install because the current version # from pytorch/llvm:9.0.1 is x86 specific SKIP_LLVM_SRC_BUILD_INSTALL=yes @@ -429,10 +662,16 @@ case "$image" in ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=11 ACL=yes +<<<<<<< HEAD PROTOBUF=yes DB=yes VISION=yes CONDA_CMAKE=yes +======= + VISION=yes + CONDA_CMAKE=yes + OPENBLAS=yes +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # snadampal: skipping llvm src build install because the current version # from pytorch/llvm:9.0.1 is x86 specific SKIP_LLVM_SRC_BUILD_INSTALL=yes @@ -440,8 +679,11 @@ case "$image" in ;; *) # Catch-all for builds that are not hardcoded. 
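Note: the catch-all branch below falls back to parsing versions straight out of the image name (e.g. `rocm6.3`, `py3.10`) via `extract_version_from_image_name`. A hypothetical Python sketch of that kind of extraction; the real helper is a shell function defined elsewhere in `build.sh` and is not shown in this hunk:

    import re

    def extract_version_from_image_name(image: str, keyword: str):
        """Illustrative only: return the version string that follows `keyword` in the image name."""
        m = re.search(re.escape(keyword) + r"([0-9][0-9.]*)", image)
        return m.group(1) if m else None

    image = "pytorch-linux-jammy-rocm6.3-py3.10"
    print(extract_version_from_image_name(image, "rocm"))  # 6.3
    print(extract_version_from_image_name(image, "py"))    # 3.10
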
+<<<<<<< HEAD PROTOBUF=yes DB=yes +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VISION=yes echo "image '$image' did not match an existing build configuration" if [[ "$image" == *py* ]]; then @@ -457,8 +699,12 @@ case "$image" in TRITON=yes # To ensure that any ROCm config will build using conda cmake # and thus have LAPACK/MKL enabled +<<<<<<< HEAD CONDA_CMAKE=yes fi +======= + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ "$image" == *centos7* ]]; then NINJA_VERSION=1.10.2 fi @@ -474,14 +720,18 @@ case "$image" in if [[ "$image" == *glibc* ]]; then extract_version_from_image_name glibc GLIBC_VERSION fi +<<<<<<< HEAD if [[ "$image" == *cmake* ]]; then extract_version_from_image_name cmake CMAKE_VERSION fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ;; esac tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]') +<<<<<<< HEAD #when using cudnn version 8 install it separately from cuda if [[ "$image" == *cuda* && ${OS} == "ubuntu" ]]; then IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu${UBUNTU_VERSION}" @@ -493,10 +743,19 @@ fi DOCKER_PROGRESS="--progress=plain" if [[ "${DOCKER_BUILDKIT}" == 0 ]]; then DOCKER_PROGRESS="" +======= +no_cache_flag="" +progress_flag="" +# Do not use cache and progress=plain when in CI +if [[ -n "${CI:-}" ]]; then + no_cache_flag="--no-cache" + progress_flag="--progress=plain" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi # Build image docker build \ +<<<<<<< HEAD --no-cache \ ${DOCKER_PROGRESS} \ --build-arg "BUILD_ENVIRONMENT=${image}" \ @@ -506,18 +765,33 @@ docker build \ --build-arg "VISION=${VISION:-}" \ --build-arg "UBUNTU_VERSION=${UBUNTU_VERSION}" \ --build-arg "CENTOS_VERSION=${CENTOS_VERSION}" \ +======= + ${no_cache_flag} \ + ${progress_flag} \ + --build-arg "BUILD_ENVIRONMENT=${image}" \ + --build-arg "LLVMDEV=${LLVMDEV:-}" \ + --build-arg "VISION=${VISION:-}" \ + --build-arg "UBUNTU_VERSION=${UBUNTU_VERSION}" \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --build-arg "DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}" \ --build-arg "GLIBC_VERSION=${GLIBC_VERSION}" \ --build-arg "CLANG_VERSION=${CLANG_VERSION}" \ --build-arg "ANACONDA_PYTHON_VERSION=${ANACONDA_PYTHON_VERSION}" \ +<<<<<<< HEAD +======= + --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --build-arg "GCC_VERSION=${GCC_VERSION}" \ --build-arg "CUDA_VERSION=${CUDA_VERSION}" \ --build-arg "CUDNN_VERSION=${CUDNN_VERSION}" \ --build-arg "TENSORRT_VERSION=${TENSORRT_VERSION}" \ --build-arg "GRADLE_VERSION=${GRADLE_VERSION}" \ +<<<<<<< HEAD --build-arg "VULKAN_SDK_VERSION=${VULKAN_SDK_VERSION}" \ --build-arg "SWIFTSHADER=${SWIFTSHADER}" \ --build-arg "CMAKE_VERSION=${CMAKE_VERSION:-}" \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \ --build-arg "KATEX=${KATEX:-}" \ --build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \ @@ -525,7 +799,10 @@ docker build \ --build-arg 
"IMAGE_NAME=${IMAGE_NAME}" \ --build-arg "UCX_COMMIT=${UCX_COMMIT}" \ --build-arg "UCC_COMMIT=${UCC_COMMIT}" \ +<<<<<<< HEAD --build-arg "CONDA_CMAKE=${CONDA_CMAKE}" \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --build-arg "TRITON=${TRITON}" \ --build-arg "TRITON_CPU=${TRITON_CPU}" \ --build-arg "ONNX=${ONNX}" \ @@ -534,7 +811,13 @@ docker build \ --build-arg "EXECUTORCH=${EXECUTORCH}" \ --build-arg "HALIDE=${HALIDE}" \ --build-arg "XPU_VERSION=${XPU_VERSION}" \ +<<<<<<< HEAD + --build-arg "ACL=${ACL:-}" \ +======= + --build-arg "UNINSTALL_DILL=${UNINSTALL_DILL}" \ --build-arg "ACL=${ACL:-}" \ + --build-arg "OPENBLAS=${OPENBLAS:-}" \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \ --build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \ -f $(dirname ${DOCKERFILE})/Dockerfile \ @@ -551,7 +834,11 @@ docker build \ UBUNTU_VERSION=$(echo ${UBUNTU_VERSION} | sed 's/-rc$//') function drun() { +<<<<<<< HEAD docker run --rm "$tmp_tag" $* +======= + docker run --rm "$tmp_tag" "$@" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } if [[ "$OS" == "ubuntu" ]]; then @@ -599,3 +886,26 @@ if [ -n "$KATEX" ]; then exit 1 fi fi +<<<<<<< HEAD +======= + +HAS_TRITON=$(drun python -c "import triton" > /dev/null 2>&1 && echo "yes" || echo "no") +if [[ -n "$TRITON" || -n "$TRITON_CPU" ]]; then + if [ "$HAS_TRITON" = "no" ]; then + echo "expecting triton to be installed, but it is not" + exit 1 + fi +elif [ "$HAS_TRITON" = "yes" ]; then + echo "expecting triton to not be installed, but it is" + exit 1 +fi + +# Sanity check cmake version. Executorch reinstalls cmake and I'm not sure if +# they support 4.0.0 yet, so exclude them from this check. 
+CMAKE_VERSION=$(drun cmake --version) +if [[ "$EXECUTORCH" != *yes* && "$CMAKE_VERSION" != *4.* ]]; then + echo "CMake version is not 4.0.0:" + drun cmake --version + exit 1 +fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/centos-rocm/Dockerfile b/.ci/docker/centos-rocm/Dockerfile index e683e587d1eb..a9e567dcd765 100644 --- a/.ci/docker/centos-rocm/Dockerfile +++ b/.ci/docker/centos-rocm/Dockerfile @@ -1,7 +1,13 @@ ARG CENTOS_VERSION +<<<<<<< HEAD FROM quay.io/centos/centos:stream${CENTOS_VERSION} +======= +FROM centos:${CENTOS_VERSION} + +ARG CENTOS_VERSION +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Set AMD gpu targets to build for ARG PYTORCH_ROCM_ARCH @@ -13,6 +19,7 @@ ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} COPY ./common/install_base.sh install_base.sh RUN bash ./install_base.sh && rm install_base.sh +<<<<<<< HEAD #Install langpack RUN yum install -y glibc-langpack-en @@ -28,6 +35,21 @@ ENV BASH_ENV "/etc/profile" # Install ninja RUN dnf --enablerepo=crb install -y ninja-build +======= +# Update CentOS git version +RUN yum -y remove git +RUN yum -y remove git-* +RUN yum -y install https://packages.endpointdev.com/rhel/7/os/x86_64/endpoint-repo-1.9-1.x86_64.rpm && \ + sed -i 's/packages.endpoint/packages.endpointdev/' /etc/yum.repos.d/endpoint.repo +RUN yum install -y git + +# Install devtoolset +ARG DEVTOOLSET_VERSION +COPY ./common/install_devtoolset.sh install_devtoolset.sh +RUN bash ./install_devtoolset.sh && rm install_devtoolset.sh +ENV BASH_ENV "/etc/profile" + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) Install non-default glibc version ARG GLIBC_VERSION COPY ./common/install_glibc.sh install_glibc.sh @@ -40,7 +62,10 @@ RUN bash ./install_user.sh && rm install_user.sh # Install conda and other packages (e.g., numpy, pytest) ARG ANACONDA_PYTHON_VERSION +<<<<<<< HEAD ARG CONDA_CMAKE +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ARG BUILD_ENVIRONMENT ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH @@ -49,6 +74,7 @@ COPY ./common/install_conda.sh install_conda.sh COPY ./common/common_utils.sh common_utils.sh RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt +<<<<<<< HEAD # (optional) Install protobuf for ONNX ARG PROTOBUF COPY ./common/install_protobuf.sh install_protobuf.sh @@ -57,6 +83,9 @@ RUN rm install_protobuf.sh ENV INSTALLED_PROTOBUF ${PROTOBUF} # (optional) Install vision packages like OpenCV and ffmpeg +======= +# (optional) Install vision packages like OpenCV +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ARG VISION COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi @@ -69,12 +98,19 @@ COPY ./common/install_rocm.sh install_rocm.sh RUN bash ./install_rocm.sh RUN rm install_rocm.sh COPY ./common/install_rocm_magma.sh install_rocm_magma.sh +<<<<<<< HEAD RUN bash ./install_rocm_magma.sh +======= +RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} +>>>>>>> 5729657180 ([ROCm] Specialized binary 
elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) RUN rm install_rocm_magma.sh COPY ./common/install_amdsmi.sh install_amdsmi.sh RUN bash ./install_amdsmi.sh RUN rm install_amdsmi.sh +<<<<<<< HEAD ENV ROCM_PATH /opt/rocm +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ENV PATH /opt/rocm/bin:$PATH ENV PATH /opt/rocm/hcc/bin:$PATH ENV PATH /opt/rocm/hip/bin:$PATH @@ -84,12 +120,15 @@ ENV MAGMA_HOME /opt/rocm/magma ENV LANG en_US.utf8 ENV LC_ALL en_US.utf8 +<<<<<<< HEAD # (optional) Install non-default CMake version ARG CMAKE_VERSION COPY ./common/install_cmake.sh install_cmake.sh RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi RUN rm install_cmake.sh +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) Install non-default Ninja version ARG NINJA_VERSION COPY ./common/install_ninja.sh install_ninja.sh diff --git a/.ci/docker/ci_commit_pins/executorch.txt b/.ci/docker/ci_commit_pins/executorch.txt index 6e9cfe33fe63..8c6dc1ab2b7e 100644 --- a/.ci/docker/ci_commit_pins/executorch.txt +++ b/.ci/docker/ci_commit_pins/executorch.txt @@ -1 +1,5 @@ +<<<<<<< HEAD ebe8522378c3f9944aaaef44868f5ececdd845fc +======= +56392aa978594cc155fa8af48cd949f5b5f1823a +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/ci_commit_pins/nccl-cu12.txt b/.ci/docker/ci_commit_pins/nccl-cu12.txt index 4ddb4745d2c4..fa2c32a000de 100644 --- a/.ci/docker/ci_commit_pins/nccl-cu12.txt +++ b/.ci/docker/ci_commit_pins/nccl-cu12.txt @@ -1 +1,5 @@ +<<<<<<< HEAD v2.26.2-1 +======= +v2.27.3-1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/ci_commit_pins/triton-xpu.txt b/.ci/docker/ci_commit_pins/triton-xpu.txt index 7669ab74ea7c..258f62c5c597 100644 --- a/.ci/docker/ci_commit_pins/triton-xpu.txt +++ b/.ci/docker/ci_commit_pins/triton-xpu.txt @@ -1 +1,5 @@ +<<<<<<< HEAD 0bcc8265e677e5321606a3311bf71470f14456a8 +======= +ae324eeac8e102a2b40370e341460f3791353398 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt index 24d633a34ead..a4aba649eb6a 100644 --- a/.ci/docker/ci_commit_pins/triton.txt +++ b/.ci/docker/ci_commit_pins/triton.txt @@ -1 +1,5 @@ +<<<<<<< HEAD 9c7bc0a3d41407bff948b40cd0e9c793147e49bc +======= +21876a4bbaf371bcb83df8e6ee4f43a92f524dfe +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/common/cache_vision_models.sh b/.ci/docker/common/cache_vision_models.sh index 8380c48177de..760cbb85cd2a 100644 --- a/.ci/docker/common/cache_vision_models.sh +++ b/.ci/docker/common/cache_vision_models.sh @@ -2,6 +2,7 @@ set -ex +<<<<<<< HEAD # Skip pytorch-nightly installation in docker images # Installation of pytorch-nightly is needed to prefetch mobilenet_v2 avd v3 models for some tests. 
# Came from https://github.com/ROCm/pytorch/commit/85bd6bc0105162293fa0bbfb7b661f85ec67f85a @@ -16,6 +17,8 @@ set -ex echo "Skip torch-nightly installation" exit 0 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" # Cache the test models at ~/.cache/torch/hub/ diff --git a/.ci/docker/common/install_base.sh b/.ci/docker/common/install_base.sh index 0e15bc931746..968033fcc843 100755 --- a/.ci/docker/common/install_base.sh +++ b/.ci/docker/common/install_base.sh @@ -33,6 +33,7 @@ install_ubuntu() { maybe_libomp_dev="" fi +<<<<<<< HEAD # HACK: UCC testing relies on libnccl library from NVIDIA repo, and version 2.16 crashes # See https://github.com/pytorch/pytorch/pull/105260#issuecomment-1673399729 # TODO: Eliminate this hack, we should not relay on apt-get installation @@ -45,6 +46,8 @@ install_ubuntu() { maybe_libnccl_dev="" fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install common dependencies apt-get update # TODO: Some of these may not be necessary @@ -73,7 +76,10 @@ install_ubuntu() { libasound2-dev \ libsndfile-dev \ ${maybe_libomp_dev} \ +<<<<<<< HEAD ${maybe_libnccl_dev} \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) software-properties-common \ wget \ sudo \ @@ -90,12 +96,15 @@ install_ubuntu() { # see: https://github.com/pytorch/pytorch/issues/65931 apt-get install -y libgnutls30 +<<<<<<< HEAD # Required to install the fortran after gcc update if [[ "$UBUNTU_VERSION" == "22.04"* ]]; then apt autoremove -y gfortran apt-get update -y apt-get install -y gfortran fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Cleanup package manager apt-get autoclean && apt-get clean rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* @@ -104,6 +113,7 @@ install_ubuntu() { install_centos() { # Need EPEL for many packages we depend on. # See http://fedoraproject.org/wiki/EPEL +<<<<<<< HEAD # extras repo is not there for CentOS 9 and epel-release is already part of repo list if [[ $OS_VERSION == 9 ]]; then yum install -y epel-release @@ -119,6 +129,13 @@ install_centos() { # for Caffe2. That said, we still install them to make sure the build # system opts to build/use protoc and libprotobuf from third-party. 
yum install -y $ALLOW_ERASE \ +======= + yum --enablerepo=extras install -y epel-release + + ccache_deps="asciidoc docbook-dtds docbook-style-xsl libxslt" + numpy_deps="gcc-gfortran" + yum install -y \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) $ccache_deps \ $numpy_deps \ autoconf \ @@ -135,13 +152,20 @@ install_centos() { glibc-headers \ glog-devel \ libstdc++-devel \ +<<<<<<< HEAD make \ +======= + libsndfile-devel \ + make \ + opencv-devel \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sudo \ wget \ vim \ unzip \ gdb +<<<<<<< HEAD if [[ $OS_VERSION == 9 ]] then dnf --enablerepo=crb -y install libsndfile-devel @@ -152,6 +176,8 @@ install_centos() { libsndfile-devel fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Cleanup yum clean all rm -rf /var/cache/yum @@ -159,10 +185,15 @@ install_centos() { rm -rf /var/lib/yum/history } +<<<<<<< HEAD ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') OS_VERSION=$(grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"') # Install base packages depending on the base OS +======= +# Install base packages depending on the base OS +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case "$ID" in ubuntu) install_ubuntu diff --git a/.ci/docker/common/install_cache.sh b/.ci/docker/common/install_cache.sh index 0e181bd7f43d..75b684268ff1 100644 --- a/.ci/docker/common/install_cache.sh +++ b/.ci/docker/common/install_cache.sh @@ -9,7 +9,11 @@ install_ubuntu() { # Instead use lib and headers from OpenSSL1.1 installed in `install_openssl.sh`` apt-get install -y cargo echo "Checking out sccache repo" +<<<<<<< HEAD git clone https://github.com/mozilla/sccache -b v0.9.1 +======= + git clone https://github.com/mozilla/sccache -b v0.10.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cd sccache echo "Building sccache" cargo build --release @@ -36,12 +40,16 @@ sed -e 's|PATH="\(.*\)"|PATH="/opt/cache/bin:\1"|g' -i /etc/environment export PATH="/opt/cache/bin:$PATH" # Setup compiler cache +<<<<<<< HEAD if [ -n "$ROCM_VERSION" ]; then curl --retry 3 http://repo.radeon.com/misc/.sccache_amd/sccache -o /opt/cache/bin/sccache else install_ubuntu fi +======= +install_ubuntu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) chmod a+x /opt/cache/bin/sccache function write_sccache_stub() { diff --git a/.ci/docker/common/install_clang.sh b/.ci/docker/common/install_clang.sh index f7ef2fb374e4..baca548c0d99 100755 --- a/.ci/docker/common/install_clang.sh +++ b/.ci/docker/common/install_clang.sh @@ -4,6 +4,7 @@ set -ex if [ -n "$CLANG_VERSION" ]; then +<<<<<<< HEAD if [[ $CLANG_VERSION == 9 && $UBUNTU_VERSION == 18.04 ]]; then sudo apt-get update # gpg-agent is not available by default on 18.04 @@ -14,6 +15,12 @@ if [ -n "$CLANG_VERSION" ]; then # work around ubuntu apt-get conflicts sudo apt-get -y -f install wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add - +======= + if [[ $UBUNTU_VERSION == 22.04 ]]; then + # work around ubuntu apt-get conflicts + sudo apt-get -y -f install + wget 
--no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add - +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ $CLANG_VERSION == 18 ]]; then apt-add-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-18 main" fi @@ -41,7 +48,11 @@ if [ -n "$CLANG_VERSION" ]; then # clang's packaging is a little messed up (the runtime libs aren't # added into the linker path), so give it a little help clang_lib=("/usr/lib/llvm-$CLANG_VERSION/lib/clang/"*"/lib/linux") +<<<<<<< HEAD echo "$clang_lib" > /etc/ld.so.conf.d/clang.conf +======= + echo "$clang_lib" >/etc/ld.so.conf.d/clang.conf +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ldconfig # Cleanup package manager diff --git a/.ci/docker/common/install_conda.sh b/.ci/docker/common/install_conda.sh index ae6d870836b1..d040499f679c 100755 --- a/.ci/docker/common/install_conda.sh +++ b/.ci/docker/common/install_conda.sh @@ -4,12 +4,17 @@ set -ex # Optionally install conda if [ -n "$ANACONDA_PYTHON_VERSION" ]; then +<<<<<<< HEAD BASE_URL="https://repo.anaconda.com/miniconda" CONDA_FILE="Miniconda3-latest-Linux-x86_64.sh" if [[ $(uname -m) == "aarch64" ]] || [[ "$BUILD_ENVIRONMENT" == *xpu* ]] || [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then BASE_URL="https://github.com/conda-forge/miniforge/releases/latest/download" CONDA_FILE="Miniforge3-Linux-$(uname -m).sh" fi +======= + BASE_URL="https://github.com/conda-forge/miniforge/releases/latest/download" # @lint-ignore + CONDA_FILE="Miniforge3-Linux-$(uname -m).sh" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) MAJOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 1) MINOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 2) @@ -21,7 +26,10 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then exit 1 ;; esac +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) mkdir -p /opt/conda chown jenkins:jenkins /opt/conda @@ -45,6 +53,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then # Prevent conda from updating to 4.14.0, which causes docker build failures # See https://hud.pytorch.org/pytorch/pytorch/commit/754d7f05b6841e555cea5a4b2c505dd9e0baec1d +<<<<<<< HEAD # Uncomment the below when resolved to track the latest conda update, # but this is required for CentOS stream 9 builds ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') @@ -52,6 +61,10 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then if [[ $ID == centos && $OS_VERSION == 9 ]]; then as_jenkins conda update -y -n base conda fi +======= + # Uncomment the below when resolved to track the latest conda update + # as_jenkins conda update -y -n base conda +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ $(uname -m) == "aarch64" ]]; then export SYSROOT_DEP="sysroot_linux-aarch64=2.17" @@ -85,6 +98,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then # and libpython-static for torch deploy conda_install llvmdev=8.0.0 "libpython-static=${ANACONDA_PYTHON_VERSION}" +<<<<<<< HEAD # Use conda cmake in some cases. Conda cmake will be newer than our supported # min version (3.5 for xenial and 3.10 for bionic), so we only do it in those # following builds that we know should use conda. 
Specifically, Ubuntu bionic @@ -93,11 +107,17 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then conda_install cmake=3.31.6 fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Magma package names are concatenation of CUDA major and minor ignoring revision # I.e. magma-cuda102 package corresponds to CUDA_VERSION=10.2 and CUDA_VERSION=10.2.89 # Magma is installed from a tarball in the ossci-linux bucket into the conda env if [ -n "$CUDA_VERSION" ]; then +<<<<<<< HEAD ${SCRIPT_FOLDER}/install_magma_conda.sh $(cut -f1-2 -d'.' <<< ${CUDA_VERSION}) ${ANACONDA_PYTHON_VERSION} +======= + conda_run ${SCRIPT_FOLDER}/install_magma_conda.sh $(cut -f1-2 -d'.' <<< ${CUDA_VERSION}) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi if [[ "$UBUNTU_VERSION" == "24.04"* ]] ; then diff --git a/.ci/docker/common/install_cpython.sh b/.ci/docker/common/install_cpython.sh index c6a9b27721b8..32e5f37c9ea8 100755 --- a/.ci/docker/common/install_cpython.sh +++ b/.ci/docker/common/install_cpython.sh @@ -3,11 +3,19 @@ set -uex -o pipefail PYTHON_DOWNLOAD_URL=https://www.python.org/ftp/python +<<<<<<< HEAD PYTHON_DOWNLOAD_GITHUB_BRANCH=https://github.com/python/cpython/archive/refs/heads GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py # Python versions to be installed in /opt/$VERSION_NO CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.8.1 3.9.0 3.10.1 3.11.0 3.12.0 3.13.0 3.13.0t"} +======= +PYTHON_DOWNLOAD_GITHUB_BRANCH=https://github.com/python/cpython/archive/refs/heads # @lint-ignore +GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py + +# Python versions to be installed in /opt/$VERSION_NO +CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.9.0 3.10.1 3.11.0 3.12.0 3.13.0 3.13.0t"} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) function check_var { if [ -z "$1" ]; then diff --git a/.ci/docker/common/install_cuda.sh b/.ci/docker/common/install_cuda.sh index 943e8826e1ee..aa3ce39aea4a 100644 --- a/.ci/docker/common/install_cuda.sh +++ b/.ci/docker/common/install_cuda.sh @@ -2,6 +2,7 @@ set -ex +<<<<<<< HEAD NCCL_VERSION=v2.26.2-1 CUDNN_VERSION=9.5.1.17 @@ -136,10 +137,61 @@ function install_126 { rm -rf nccl install_cusparselt_063 +======= +arch_path='' +targetarch=${TARGETARCH:-$(uname -m)} +if [ ${targetarch} = 'amd64' ] || [ "${targetarch}" = 'x86_64' ]; then + arch_path='x86_64' +else + arch_path='sbsa' +fi + +function install_cuda { + version=$1 + runfile=$2 + major_minor=${version%.*} + rm -rf /usr/local/cuda-${major_minor} /usr/local/cuda + if [[ ${arch_path} == 'sbsa' ]]; then + runfile="${runfile}_sbsa" + fi + runfile="${runfile}.run" + wget -q https://developer.download.nvidia.com/compute/cuda/${version}/local_installers/${runfile} -O ${runfile} + chmod +x ${runfile} + ./${runfile} --toolkit --silent + rm -f ${runfile} + rm -f /usr/local/cuda && ln -s /usr/local/cuda-${major_minor} /usr/local/cuda +} + +function install_cudnn { + cuda_major_version=$1 + cudnn_version=$2 + mkdir tmp_cudnn && cd tmp_cudnn + # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement + filepath="cudnn-linux-${arch_path}-${cudnn_version}_cuda${cuda_major_version}-archive" + wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-${arch_path}/${filepath}.tar.xz + tar xf ${filepath}.tar.xz + cp -a ${filepath}/include/* /usr/local/cuda/include/ + cp -a ${filepath}/lib/* 
/usr/local/cuda/lib64/ + cd .. + rm -rf tmp_cudnn +} + +function install_126 { + CUDNN_VERSION=9.10.2.21 + echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1" + install_cuda 12.6.3 cuda_12.6.3_560.35.05_linux + + install_cudnn 12 $CUDNN_VERSION + + CUDA_VERSION=12.6 bash install_nccl.sh + + CUDA_VERSION=12.6 bash install_cusparselt.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ldconfig } +<<<<<<< HEAD function prune_118 { echo "Pruning CUDA 11.8 and cuDNN" ##################################################################################### @@ -203,6 +255,22 @@ function prune_124 { ##################################################################################### export CUDA_BASE="/usr/local/cuda-12.4/" rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/ +======= +function install_129 { + CUDNN_VERSION=9.10.2.21 + echo "Installing CUDA 12.9.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1" + # install CUDA 12.9.1 in the same container + install_cuda 12.9.1 cuda_12.9.1_575.57.08_linux + + # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement + install_cudnn 12 $CUDNN_VERSION + + CUDA_VERSION=12.9 bash install_nccl.sh + + CUDA_VERSION=12.9 bash install_cusparselt.sh + + ldconfig +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } function prune_126 { @@ -240,6 +308,7 @@ function prune_126 { } function install_128 { +<<<<<<< HEAD CUDNN_VERSION=9.7.1.26 echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3" rm -rf /usr/local/cuda-12.8 /usr/local/cuda @@ -269,6 +338,19 @@ function install_128 { rm -rf nccl install_cusparselt_063 +======= + CUDNN_VERSION=9.8.0.87 + echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1" + # install CUDA 12.8.1 in the same container + install_cuda 12.8.1 cuda_12.8.1_570.124.06_linux + + # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement + install_cudnn 12 $CUDNN_VERSION + + CUDA_VERSION=12.8 bash install_nccl.sh + + CUDA_VERSION=12.8 bash install_cusparselt.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ldconfig } @@ -277,6 +359,7 @@ function install_128 { while test $# -gt 0 do case "$1" in +<<<<<<< HEAD 11.8) install_118; prune_118 ;; 12.4) install_124; prune_124 @@ -284,6 +367,13 @@ do 12.6) install_126; prune_126 ;; 12.8) install_128; +======= + 12.6|12.6.*) install_126; prune_126 + ;; + 12.8|12.8.*) install_128; + ;; + 12.9|12.9.*) install_129; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ;; *) echo "bad argument $1"; exit 1 ;; diff --git a/.ci/docker/common/install_cudnn.sh b/.ci/docker/common/install_cudnn.sh index e008cda5c7a6..90b8cd58fe69 100644 --- a/.ci/docker/common/install_cudnn.sh +++ b/.ci/docker/common/install_cudnn.sh @@ -4,12 +4,19 @@ if [[ -n "${CUDNN_VERSION}" ]]; then # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement mkdir tmp_cudnn pushd tmp_cudnn +<<<<<<< HEAD if [[ ${CUDA_VERSION:0:4} == "12.8" ]]; then CUDNN_NAME="cudnn-linux-x86_64-9.7.1.26_cuda12-archive" elif [[ ${CUDA_VERSION:0:4} == "12.6" ]]; then CUDNN_NAME="cudnn-linux-x86_64-9.5.1.17_cuda12-archive" elif [[ 
${CUDA_VERSION:0:2} == "12" ]]; then CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda12-archive" +======= + if [[ ${CUDA_VERSION:0:4} == "12.9" || ${CUDA_VERSION:0:4} == "12.8" ]]; then + CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive" + elif [[ ${CUDA_VERSION:0:4} == "12.6" ]]; then + CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [[ ${CUDA_VERSION:0:2} == "11" ]]; then CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda11-archive" else diff --git a/.ci/docker/common/install_cusparselt.sh b/.ci/docker/common/install_cusparselt.sh index 0603739fb041..19debfcb400f 100644 --- a/.ci/docker/common/install_cusparselt.sh +++ b/.ci/docker/common/install_cusparselt.sh @@ -5,12 +5,17 @@ set -ex # cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html mkdir tmp_cusparselt && cd tmp_cusparselt +<<<<<<< HEAD if [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-8]$ ]]; then +======= +if [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) arch_path='sbsa' export TARGETARCH=${TARGETARCH:-$(uname -m)} if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then arch_path='x86_64' fi +<<<<<<< HEAD CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.6.3.2-archive" curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz elif [[ ${CUDA_VERSION:0:4} == "12.4" ]]; then @@ -24,6 +29,10 @@ elif [[ ${CUDA_VERSION:0:4} == "12.4" ]]; then elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.4.0.7-archive" curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz +======= + CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.7.1.0-archive" + curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else echo "Not sure which libcusparselt version to install for this ${CUDA_VERSION}" fi diff --git a/.ci/docker/common/install_executorch.sh b/.ci/docker/common/install_executorch.sh index e30e0a787bbe..b7ea7e3f2cfa 100755 --- a/.ci/docker/common/install_executorch.sh +++ b/.ci/docker/common/install_executorch.sh @@ -13,7 +13,11 @@ clone_executorch() { # and fetch the target commit pushd executorch git checkout "${EXECUTORCH_PINNED_COMMIT}" +<<<<<<< HEAD git submodule update --init +======= + git submodule update --init --recursive +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) popd chown -R jenkins executorch diff --git a/.ci/docker/common/install_halide.sh b/.ci/docker/common/install_halide.sh index ed1d7d33649d..d589a179a67f 100644 --- a/.ci/docker/common/install_halide.sh +++ b/.ci/docker/common/install_halide.sh @@ -17,7 +17,11 @@ if [ -n "${UBUNTU_VERSION}" ];then libopenblas-dev libeigen3-dev libatlas-base-dev libzstd-dev fi +<<<<<<< HEAD conda_install numpy scipy imageio cmake ninja +======= +pip_install numpy scipy imageio cmake ninja +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) git clone --depth 1 --branch release/16.x --recursive https://github.com/llvm/llvm-project.git cmake -DCMAKE_BUILD_TYPE=Release \ diff --git a/.ci/docker/common/install_inductor_benchmark_deps.sh b/.ci/docker/common/install_inductor_benchmark_deps.sh index 5b775af6539f..28b145413aae 100644 --- a/.ci/docker/common/install_inductor_benchmark_deps.sh +++ b/.ci/docker/common/install_inductor_benchmark_deps.sh @@ -16,7 +16,11 @@ function install_timm() { pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}" # Clean up +<<<<<<< HEAD conda_run pip uninstall -y cmake torch torchvision triton +======= + conda_run pip uninstall -y torch torchvision triton +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } # Pango is needed for weasyprint which is needed for doctr diff --git a/.ci/docker/common/install_linter.sh b/.ci/docker/common/install_linter.sh index a7f008fb735d..1d694604472a 100644 --- a/.ci/docker/common/install_linter.sh +++ b/.ci/docker/common/install_linter.sh @@ -2,8 +2,11 @@ set -ex +<<<<<<< HEAD source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [ -n "${UBUNTU_VERSION}" ]; then apt update apt-get install -y clang doxygen git graphviz nodejs npm libtinfo5 @@ -15,8 +18,13 @@ chown -R jenkins pytorch pushd pytorch # Install all linter dependencies +<<<<<<< HEAD pip_install -r requirements.txt conda_run lintrunner init +======= +pip install -r requirements.txt +lintrunner init +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Cache .lintbin directory as part of the Docker image cp -r .lintbin /tmp diff --git a/.ci/docker/common/install_magma_conda.sh b/.ci/docker/common/install_magma_conda.sh index 9557d943004c..62f6303a3166 100755 --- a/.ci/docker/common/install_magma_conda.sh +++ b/.ci/docker/common/install_magma_conda.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +<<<<<<< HEAD # Script that replaces the magma install from a conda package set -eou pipefail @@ -24,3 +25,27 @@ function do_install() { } do_install $1 $2 +======= +# Script that installs magma from tarball inside conda environment. +# It replaces anaconda magma-cuda package which is no longer published. +# Execute it inside active conda environment. 
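As context for the rewritten install_magma_conda.sh whose body follows, install_conda.sh (earlier in this patch) passes it only the CUDA major.minor, trimmed with cut, and the script itself then strips the remaining dot when building the archive name. A tiny, hedged illustration of both trims, using example values only:

#!/bin/bash
# Illustration only; CUDA_VERSION and the archive name below are example values.
CUDA_VERSION="12.6.3"
majmin=$(cut -f1-2 -d'.' <<< "${CUDA_VERSION}")   # 12.6 -- what install_conda.sh passes as $1
cuda_version_nodot=${majmin/./}                   # 126  -- what the script embeds in the tarball name
echo "magma-cuda${cuda_version_nodot}-2.6.1-1.tar.bz2"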
+# See issue: https://github.com/pytorch/pytorch/issues/138506 + +set -eou pipefail + +cuda_version_nodot=${1/./} +anaconda_dir=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + +MAGMA_VERSION="2.6.1" +magma_archive="magma-cuda${cuda_version_nodot}-${MAGMA_VERSION}-1.tar.bz2" +( + set -x + tmp_dir=$(mktemp -d) + pushd ${tmp_dir} + curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive} + tar -xvf "${magma_archive}" + mv include/* "${anaconda_dir}/include/" + mv lib/* "${anaconda_dir}/lib" + popd +) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/common/install_nccl.sh b/.ci/docker/common/install_nccl.sh new file mode 100644 index 000000000000..17d80ebe7d27 --- /dev/null +++ b/.ci/docker/common/install_nccl.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +set -ex + +NCCL_VERSION="" +if [[ ${CUDA_VERSION:0:2} == "11" ]]; then + NCCL_VERSION=$(cat ci_commit_pins/nccl-cu11.txt) +elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then + NCCL_VERSION=$(cat ci_commit_pins/nccl-cu12.txt) +else + echo "Unexpected CUDA_VERSION ${CUDA_VERSION}" + exit 1 +fi + +if [[ -n "${NCCL_VERSION}" ]]; then + # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses + # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build + git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git + pushd nccl + make -j src.build + cp -a build/include/* /usr/local/cuda/include/ + cp -a build/lib/* /usr/local/cuda/lib64/ + popd + rm -rf nccl + ldconfig +fi diff --git a/.ci/docker/common/install_onnx.sh b/.ci/docker/common/install_onnx.sh index fdd0f9acf135..51a52792945f 100755 --- a/.ci/docker/common/install_onnx.sh +++ b/.ci/docker/common/install_onnx.sh @@ -8,6 +8,7 @@ retry () { "$@" || (sleep 10 && "$@") || (sleep 20 && "$@") || (sleep 40 && "$@") } +<<<<<<< HEAD # A bunch of custom pip dependencies for ONNX pip_install \ beartype==0.15.0 \ @@ -18,6 +19,8 @@ pip_install \ networkx==2.5 \ numpy==1.24.2 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # ONNXRuntime should be installed before installing # onnx-weekly. Otherwise, onnx-weekly could be # overwritten by onnx. @@ -29,12 +32,17 @@ pip_install \ transformers==4.36.2 pip_install coloredlogs packaging +<<<<<<< HEAD pip_install onnxruntime==1.18.1 pip_install onnx==1.17.0 pip_install onnxscript==0.2.2 --no-deps # required by onnxscript pip_install ml_dtypes +======= +pip_install onnxruntime==1.18.1 +pip_install onnxscript==0.3.1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Cache the transformers model to be used later by ONNX tests. We need to run the transformers # package to download the model. 
By default, the model is cached at ~/.cache/huggingface/hub/ diff --git a/.ci/docker/common/install_openblas.sh b/.ci/docker/common/install_openblas.sh index 7f0b3620bdc1..e71a678a3483 100644 --- a/.ci/docker/common/install_openblas.sh +++ b/.ci/docker/common/install_openblas.sh @@ -4,9 +4,15 @@ set -ex cd / +<<<<<<< HEAD git clone https://github.com/OpenMathLib/OpenBLAS.git -b v0.3.29 --depth 1 --shallow-submodules +======= +git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.30}" --depth 1 --shallow-submodules + +OPENBLAS_CHECKOUT_DIR="OpenBLAS" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) OPENBLAS_BUILD_FLAGS=" NUM_THREADS=128 USE_OPENMP=1 @@ -14,9 +20,15 @@ NO_SHARED=0 DYNAMIC_ARCH=1 TARGET=ARMV8 CFLAGS=-O3 +<<<<<<< HEAD " OPENBLAS_CHECKOUT_DIR="OpenBLAS" +======= +BUILD_BFLOAT16=1 +" + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) make -j8 ${OPENBLAS_BUILD_FLAGS} -C ${OPENBLAS_CHECKOUT_DIR} make -j8 ${OPENBLAS_BUILD_FLAGS} install -C ${OPENBLAS_CHECKOUT_DIR} diff --git a/.ci/docker/common/install_python.sh b/.ci/docker/common/install_python.sh new file mode 100644 index 000000000000..be5a09b80a60 --- /dev/null +++ b/.ci/docker/common/install_python.sh @@ -0,0 +1,15 @@ +#!/bin/bash +set -ex + +apt-get update +# Use deadsnakes in case we need an older python version +sudo add-apt-repository ppa:deadsnakes/ppa +apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python3-pip python${PYTHON_VERSION}-venv + +# Use a venv because uv and some other package managers don't support --user install +ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python +python -m venv /var/lib/jenkins/ci_env +source /var/lib/jenkins/ci_env/bin/activate + +python -mpip install --upgrade pip +python -mpip install -r /opt/requirements-ci.txt diff --git a/.ci/docker/common/install_rocm.sh b/.ci/docker/common/install_rocm.sh index 5f655f2010d4..39de64f7006f 100644 --- a/.ci/docker/common/install_rocm.sh +++ b/.ci/docker/common/install_rocm.sh @@ -21,6 +21,21 @@ install_ubuntu() { apt-get install -y libc++1 apt-get install -y libc++abi1 +<<<<<<< HEAD +======= + # Make sure rocm packages from repo.radeon.com have highest priority + cat << EOF > /etc/apt/preferences.d/rocm-pin-600 +Package: * +Pin: release o=repo.radeon.com +Pin-Priority: 600 +EOF + + # we want the patch version of 6.4 instead + if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then + ROCM_VERSION="${ROCM_VERSION}.1" + fi + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Add amdgpu repository UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'` echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list @@ -61,17 +76,42 @@ install_ubuntu() { done # ROCm 6.3 had a regression where initializing static code objects had significant overhead +<<<<<<< HEAD if [[ $(ver $ROCM_VERSION) -eq $(ver 6.3) ]]; then # clr build needs CppHeaderParser but can only find it using conda's python /opt/conda/bin/python -m pip install CppHeaderParser git clone https://github.com/ROCm/HIP -b rocm-6.3.x HIP_COMMON_DIR=$(readlink -f HIP) git clone https://github.com/jeffdaily/clr -b release/rocm-rel-6.3-statco-hotfix +======= + # ROCm 6.4 did not yet fix the regression, also HIP 
branch names are different + if [[ $(ver $ROCM_VERSION) -ge $(ver 6.3) ]] && [[ $(ver $ROCM_VERSION) -lt $(ver 7.0) ]]; then + if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4.1) ]]; then + HIP_BRANCH=release/rocm-rel-6.4 + VER_STR=6.4 + VER_PATCH=.1 + elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then + HIP_BRANCH=release/rocm-rel-6.4 + VER_STR=6.4 + elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.3) ]]; then + HIP_BRANCH=rocm-6.3.x + VER_STR=6.3 + fi + # clr build needs CppHeaderParser but can only find it using conda's python + /opt/conda/bin/python -m pip install CppHeaderParser + git clone https://github.com/ROCm/HIP -b $HIP_BRANCH + HIP_COMMON_DIR=$(readlink -f HIP) + git clone https://github.com/jeffdaily/clr -b release/rocm-rel-${VER_STR}${VER_PATCH}-statco-hotfix +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) mkdir -p clr/build pushd clr/build cmake .. -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR make -j +<<<<<<< HEAD cp hipamd/lib/libamdhip64.so.6.3.* /opt/rocm/lib/libamdhip64.so.6.3.* +======= + cp hipamd/lib/libamdhip64.so.${VER_STR}.* /opt/rocm/lib/libamdhip64.so.${VER_STR}.* +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) popd rm -rf HIP clr fi @@ -86,6 +126,7 @@ install_centos() { yum update -y yum install -y kmod yum install -y wget +<<<<<<< HEAD if [[ $OS_VERSION == 9 ]]; then dnf install -y openblas-serial @@ -122,6 +163,28 @@ install_centos() { else local rocm_baseurl="http://repo.radeon.com/rocm/yum/${ROCM_VERSION}/main" fi +======= + yum install -y openblas-devel + + yum install -y epel-release + yum install -y dkms kernel-headers-`uname -r` kernel-devel-`uname -r` + + # Add amdgpu repository + local amdgpu_baseurl + if [[ $OS_VERSION == 9 ]]; then + amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/9.0/main/x86_64" + else + amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/7.9/main/x86_64" + fi + echo "[AMDGPU]" > /etc/yum.repos.d/amdgpu.repo + echo "name=AMDGPU" >> /etc/yum.repos.d/amdgpu.repo + echo "baseurl=${amdgpu_baseurl}" >> /etc/yum.repos.d/amdgpu.repo + echo "enabled=1" >> /etc/yum.repos.d/amdgpu.repo + echo "gpgcheck=1" >> /etc/yum.repos.d/amdgpu.repo + echo "gpgkey=http://repo.radeon.com/rocm/rocm.gpg.key" >> /etc/yum.repos.d/amdgpu.repo + + local rocm_baseurl="http://repo.radeon.com/rocm/yum/${ROCM_VERSION}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "[ROCm]" > /etc/yum.repos.d/rocm.repo echo "name=ROCm" >> /etc/yum.repos.d/rocm.repo echo "baseurl=${rocm_baseurl}" >> /etc/yum.repos.d/rocm.repo @@ -129,6 +192,7 @@ install_centos() { echo "gpgcheck=1" >> /etc/yum.repos.d/rocm.repo echo "gpgkey=http://repo.radeon.com/rocm/rocm.gpg.key" >> /etc/yum.repos.d/rocm.repo +<<<<<<< HEAD if [[ $OS_VERSION == 9 ]]; then yum update -y --nogpgcheck dnf --enablerepo=crb install -y perl-File-BaseDir python3-wheel @@ -136,6 +200,11 @@ install_centos() { else yum update -y yum install -y \ +======= + yum update -y + + yum install -y \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) rocm-dev \ rocm-utils \ rocm-libs \ @@ -143,7 +212,10 @@ install_centos() { rocprofiler-dev \ roctracer-dev \ amd-smi-lib +<<<<<<< HEAD fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) # precompiled miopen kernels; search for all unversioned packages # if search fails it will abort this script; use true to avoid case where search fails @@ -167,8 +239,11 @@ install_centos() { rm -rf /var/lib/yum/history } +<<<<<<< HEAD OS_VERSION=$(grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"') +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install Python packages depending on the base OS ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') case "$ID" in diff --git a/.ci/docker/common/install_rocm_magma.sh b/.ci/docker/common/install_rocm_magma.sh index db826ed6e027..d3677483ea7c 100644 --- a/.ci/docker/common/install_rocm_magma.sh +++ b/.ci/docker/common/install_rocm_magma.sh @@ -1,3 +1,4 @@ +<<<<<<< HEAD #!/bin/bash # Script used in CI and CD pipeline @@ -58,3 +59,42 @@ LANG=C.UTF-8 make lib/libmagma.so -j $(nproc) MKLROOT="${MKLROOT}" make testing/testing_dgemm -j $(nproc) MKLROOT="${MKLROOT}" popd mv magma /opt/rocm +======= +#!/usr/bin/env bash +# Script used only in CD pipeline + +set -eou pipefail + +function do_install() { + rocm_version=$1 + if [[ ${rocm_version} =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + # chop off any patch version + rocm_version="${rocm_version%.*}" + fi + + rocm_version_nodot=${rocm_version//./} + + # Version 2.7.2 + ROCm related updates + MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6 + magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2" + + rocm_dir="/opt/rocm" + ( + set -x + tmp_dir=$(mktemp -d) + pushd ${tmp_dir} + curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive} + if tar -xvf "${magma_archive}" + then + mkdir -p "${rocm_dir}/magma" + mv include "${rocm_dir}/magma/include" + mv lib "${rocm_dir}/magma/lib" + else + echo "${magma_archive} not found, skipping magma install" + fi + popd + ) +} + +do_install $1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/common/install_triton.sh b/.ci/docker/common/install_triton.sh index b72020d822d5..f2a3363935f1 100755 --- a/.ci/docker/common/install_triton.sh +++ b/.ci/docker/common/install_triton.sh @@ -2,6 +2,7 @@ set -ex +<<<<<<< HEAD source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" get_conda_version() { @@ -10,6 +11,18 @@ get_conda_version() { conda_reinstall() { as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y --force-reinstall $* +======= +mkdir -p /opt/triton +if [ -z "${TRITON}" ] && [ -z "${TRITON_CPU}" ]; then + echo "TRITON and TRITON_CPU are not set. Exiting..." 
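The rewritten install_rocm_magma.sh above normalizes its ROCm version argument with two bash parameter expansions: %.* chops the patch component, and //./ deletes the remaining dot for the archive name. In isolation, with an example version:

#!/bin/bash
# The two parameter expansions install_rocm_magma.sh applies to its argument (example value).
rocm_version="6.4.1"
if [[ ${rocm_version} =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
  rocm_version="${rocm_version%.*}"        # 6.4.1 -> 6.4 (chop the patch level)
fi
rocm_version_nodot="${rocm_version//./}"   # 6.4 -> 64 (dots removed for the archive name)
echo "magma-rocm${rocm_version_nodot}-<commit>-1.tar.bz2"   # <commit> stands in for the pinned MAGMA_VERSION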
+ exit 0 +fi + +source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" + +get_pip_version() { + conda_run pip list | grep -w $* | head -n 1 | awk '{print $2}' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } if [ -n "${XPU_VERSION}" ]; then @@ -31,11 +44,17 @@ if [ -n "${UBUNTU_VERSION}" ];then apt-get install -y gpg-agent fi +<<<<<<< HEAD if [ -n "${CONDA_CMAKE}" ]; then # Keep the current cmake and numpy version here, so we can reinstall them later CMAKE_VERSION=$(get_conda_version cmake) NUMPY_VERSION=$(get_conda_version numpy) fi +======= +# Keep the current cmake and numpy version here, so we can reinstall them later +CMAKE_VERSION=$(get_pip_version cmake) +NUMPY_VERSION=$(get_pip_version numpy) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [ -z "${MAX_JOBS}" ]; then export MAX_JOBS=$(nproc) @@ -51,7 +70,17 @@ as_jenkins git clone --recursive ${TRITON_REPO} triton cd triton as_jenkins git checkout ${TRITON_PINNED_COMMIT} as_jenkins git submodule update --init --recursive +<<<<<<< HEAD cd python +======= + +# Old versions of python have setup.py in ./python; newer versions have it in ./ +if [ ! -f setup.py ]; then + cd python +fi + +pip_install pybind11==2.13.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # TODO: remove patch setup.py once we have a proper fix for https://github.com/triton-lang/triton/issues/4527 as_jenkins sed -i -e 's/https:\/\/tritonlang.blob.core.windows.net\/llvm-builds/https:\/\/oaitriton.blob.core.windows.net\/public\/llvm-builds/g' setup.py @@ -60,12 +89,17 @@ if [ -n "${UBUNTU_VERSION}" ] && [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}" # Triton needs at least gcc-9 to build apt-get install -y g++-9 +<<<<<<< HEAD CXX=g++-9 pip_install . +======= + CXX=g++-9 conda_run python setup.py bdist_wheel +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [ -n "${UBUNTU_VERSION}" ] && [ -n "${CLANG_VERSION}" ]; then # Triton needs which surprisingly is not available with clang-9 toolchain add-apt-repository -y ppa:ubuntu-toolchain-r/test apt-get install -y g++-9 +<<<<<<< HEAD CXX=g++-9 pip_install . else pip_install . @@ -84,4 +118,34 @@ if [ -n "${CONDA_CMAKE}" ]; then conda_reinstall cmake="${CMAKE_VERSION}" # Note that we install numpy with pip as conda might not have the version we want pip_install --force-reinstall numpy=="${NUMPY_VERSION}" +======= + CXX=g++-9 conda_run python setup.py bdist_wheel +else + conda_run python setup.py bdist_wheel +fi + +# Copy the wheel to /opt for multi stage docker builds +cp dist/*.whl /opt/triton +# Install the wheel for docker builds that don't use multi stage +pip_install dist/*.whl + +# TODO: This is to make sure that the same cmake and numpy version from install conda +# script is used. Without this step, the newer cmake version (3.25.2) downloaded by +# triton build step via pip will fail to detect conda MKL. Once that issue is fixed, +# this can be removed. +# +# The correct numpy version also needs to be set here because conda claims that it +# causes inconsistent environment. Without this, conda will attempt to install the +# latest numpy version, which fails ASAN tests with the following import error: Numba +# needs NumPy 1.20 or less. 
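The comment block around this point explains why install_triton.sh records the cmake and numpy versions before the Triton build and force-reinstalls them afterwards. conda_run and pip_install are repo helpers defined in common_utils.sh, outside this hunk, so the following restates the same record-then-restore idea with plain pip; it is a sketch, not the script itself:

#!/bin/bash
# Standalone restatement of the cmake/numpy pin-and-restore around the Triton build.
set -ex
get_pkg_version() { pip list 2>/dev/null | grep -iw "$1" | head -n 1 | awk '{print $2}'; }
CMAKE_VERSION=$(get_pkg_version cmake)
NUMPY_VERSION=$(get_pkg_version numpy)

# ... build and install the Triton wheel here; it may drag in newer cmake/numpy ...

if [ -n "${CMAKE_VERSION}" ]; then
  pip install --force-reinstall "cmake==${CMAKE_VERSION}"
fi
if [ -n "${NUMPY_VERSION}" ]; then
  pip install --force-reinstall "numpy==${NUMPY_VERSION}"
fi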
+# Note that we install numpy with pip as conda might not have the version we want +if [ -n "${CMAKE_VERSION}" ]; then + pip_install "cmake==${CMAKE_VERSION}" +fi +if [ -n "${NUMPY_VERSION}" ]; then + pip_install "numpy==${NUMPY_VERSION}" +fi +if [[ "$ANACONDA_PYTHON_VERSION" != 3.9* ]]; then + pip_install helion +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi diff --git a/.ci/docker/common/install_vision.sh b/.ci/docker/common/install_vision.sh index 532d8d14a55c..665b8c0805c6 100755 --- a/.ci/docker/common/install_vision.sh +++ b/.ci/docker/common/install_vision.sh @@ -15,6 +15,7 @@ install_ubuntu() { install_centos() { # Need EPEL for many packages we depend on. # See http://fedoraproject.org/wiki/EPEL +<<<<<<< HEAD if [[ $OS_VERSION == 9 ]]; then yum install -y epel-release else @@ -23,6 +24,12 @@ install_centos() { opencv-devel \ ffmpeg-devel fi +======= + yum --enablerepo=extras install -y epel-release + + yum install -y \ + opencv-devel +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Cleanup yum clean all @@ -31,8 +38,11 @@ install_centos() { rm -rf /var/lib/yum/history } +<<<<<<< HEAD OS_VERSION=$(grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"') +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install base packages depending on the base OS ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') case "$ID" in diff --git a/.ci/docker/common/install_xpu.sh b/.ci/docker/common/install_xpu.sh index 08e6f3aa6d1a..2270497f0025 100644 --- a/.ci/docker/common/install_xpu.sh +++ b/.ci/docker/common/install_xpu.sh @@ -26,7 +26,11 @@ function install_ubuntu() { wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \ | gpg --dearmor > /usr/share/keyrings/oneapi-archive-keyring.gpg.gpg echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg.gpg] \ +<<<<<<< HEAD https://apt.repos.intel.com/${XPU_REPO_NAME} all main" \ +======= + https://apt.repos.intel.com/oneapi all main" \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) | tee /etc/apt/sources.list.d/oneAPI.list # Update the packages list and repository index @@ -47,9 +51,12 @@ function install_ubuntu() { # Development Packages apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev # Install Intel Support Packages +<<<<<<< HEAD if [[ "$XPU_VERSION" == "2025.0" ]]; then XPU_PACKAGES="${XPU_PACKAGES} intel-oneapi-dnnl=2025.0.1-6" fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) apt-get install -y ${XPU_PACKAGES} # Cleanup @@ -77,7 +84,11 @@ function install_rhel() { tee > /etc/yum.repos.d/oneAPI.repo << EOF [oneAPI] name=Intel for Pytorch GPU dev repository +<<<<<<< HEAD baseurl=https://yum.repos.intel.com/${XPU_REPO_NAME} +======= +baseurl=https://yum.repos.intel.com/oneapi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) enabled=1 gpgcheck=1 repo_gpgcheck=1 @@ -85,9 +96,12 @@ gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS. 
EOF # Install Intel Support Packages +<<<<<<< HEAD if [[ "$XPU_VERSION" == "2025.0" ]]; then XPU_PACKAGES="${XPU_PACKAGES} intel-oneapi-dnnl-2025.0.1-6" fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) yum install -y ${XPU_PACKAGES} # The xpu-smi packages dnf install -y xpu-smi @@ -124,7 +138,11 @@ function install_sles() { https://repositories.intel.com/gpu/sles/${VERSION_SP}${XPU_DRIVER_VERSION}/unified/intel-gpu-${VERSION_SP}.repo rpm --import https://repositories.intel.com/gpu/intel-graphics.key # To add the online network network package repository for the Intel Support Packages +<<<<<<< HEAD zypper addrepo https://yum.repos.intel.com/${XPU_REPO_NAME} oneAPI +======= + zypper addrepo https://yum.repos.intel.com/oneapi oneAPI +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) rpm --import https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB # The xpu-smi packages @@ -147,10 +165,17 @@ if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then XPU_DRIVER_VERSION="" fi +<<<<<<< HEAD XPU_REPO_NAME="intel-for-pytorch-gpu-dev" XPU_PACKAGES="intel-for-pytorch-gpu-dev-0.5 intel-pti-dev-0.9" if [[ "$XPU_VERSION" == "2025.0" ]]; then XPU_REPO_NAME="oneapi" +======= +# Default use Intel® oneAPI Deep Learning Essentials 2025.0 +if [[ "$XPU_VERSION" == "2025.1" ]]; then + XPU_PACKAGES="intel-deep-learning-essentials-2025.1" +else +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) XPU_PACKAGES="intel-deep-learning-essentials-2025.0" fi diff --git a/.ci/docker/libtorch/Dockerfile b/.ci/docker/libtorch/Dockerfile index b83071b25aa5..34ee4426ef15 100644 --- a/.ci/docker/libtorch/Dockerfile +++ b/.ci/docker/libtorch/Dockerfile @@ -49,6 +49,7 @@ RUN bash ./install_mkl.sh && rm install_mkl.sh FROM cpu as cuda ADD ./common/install_cuda.sh install_cuda.sh ADD ./common/install_magma.sh install_magma.sh +<<<<<<< HEAD ENV CUDA_HOME /usr/local/cuda FROM cuda as cuda11.8 @@ -61,6 +62,13 @@ RUN bash ./install_cuda.sh 12.4 RUN bash ./install_magma.sh 12.4 RUN ln -sf /usr/local/cuda-12.4 /usr/local/cuda +======= +COPY ./common/install_nccl.sh install_nccl.sh +COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/ +COPY ./common/install_cusparselt.sh install_cusparselt.sh +ENV CUDA_HOME /usr/local/cuda + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FROM cuda as cuda12.6 RUN bash ./install_cuda.sh 12.6 RUN bash ./install_magma.sh 12.6 @@ -71,7 +79,17 @@ RUN bash ./install_cuda.sh 12.8 RUN bash ./install_magma.sh 12.8 RUN ln -sf /usr/local/cuda-12.8 /usr/local/cuda +<<<<<<< HEAD +FROM cpu as rocm +======= +FROM cuda as cuda12.9 +RUN bash ./install_cuda.sh 12.9 +RUN bash ./install_magma.sh 12.9 +RUN ln -sf /usr/local/cuda-12.9 /usr/local/cuda + FROM cpu as rocm +ARG ROCM_VERSION +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ARG PYTORCH_ROCM_ARCH ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} ENV MKLROOT /opt/intel @@ -86,11 +104,19 @@ ADD ./common/install_rocm_magma.sh install_rocm_magma.sh # gfortran and python needed for building magma from source for ROCm RUN apt-get update -y && \ apt-get install gfortran -y && \ +<<<<<<< HEAD apt-get install python -y && \ apt-get clean RUN bash ./install_rocm_drm.sh && rm 
install_rocm_drm.sh RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh +======= + apt-get install python3 python-is-python3 -y && \ + apt-get clean + +RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh +RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} && rm install_rocm_magma.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FROM ${BASE_TARGET} as final COPY --from=openssl /opt/openssl /opt/openssl diff --git a/.ci/docker/libtorch/build.sh b/.ci/docker/libtorch/build.sh index fd9932f8def8..9931d01ce866 100755 --- a/.ci/docker/libtorch/build.sh +++ b/.ci/docker/libtorch/build.sh @@ -1,12 +1,17 @@ #!/usr/bin/env bash # Script used only in CD pipeline +<<<<<<< HEAD set -eou pipefail +======= +set -eoux pipefail +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) image="$1" shift if [ -z "${image}" ]; then +<<<<<<< HEAD echo "Usage: $0 IMAGE" exit 1 fi @@ -44,10 +49,52 @@ case ${GPU_ARCH_TYPE} in ;; *) echo "ERROR: Unrecognized GPU_ARCH_TYPE: ${GPU_ARCH_TYPE}" +======= + echo "Usage: $0 IMAGENAME:ARCHTAG" + exit 1 +fi + +TOPDIR=$(git rev-parse --show-toplevel) + +DOCKER=${DOCKER:-docker} + +# Go from imagename:tag to tag +DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}') + +GPU_ARCH_VERSION="" +if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then + # extract cuda version from image name. e.g. manylinux2_28-builder:cuda12.8 returns 12.8 + GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}') +elif [[ "${DOCKER_TAG_PREFIX}" == rocm* ]]; then + # extract rocm version from image name. e.g. manylinux2_28-builder:rocm6.2.4 returns 6.2.4 + GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'rocm' '{print $2}') +fi + +case ${DOCKER_TAG_PREFIX} in + cpu) + BASE_TARGET=cpu + GPU_IMAGE=ubuntu:20.04 + DOCKER_GPU_BUILD_ARG="" + ;; + cuda*) + BASE_TARGET=cuda${GPU_ARCH_VERSION} + GPU_IMAGE=ubuntu:20.04 + DOCKER_GPU_BUILD_ARG="" + ;; + rocm*) + BASE_TARGET=rocm + GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete + PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" + DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}" + ;; + *) + echo "ERROR: Unrecognized DOCKER_TAG_PREFIX: ${DOCKER_TAG_PREFIX}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) exit 1 ;; esac +<<<<<<< HEAD ( set -x @@ -81,3 +128,16 @@ if [[ "${WITH_PUSH}" == true ]]; then fi ) fi +======= +tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]') + +DOCKER_BUILDKIT=1 ${DOCKER} build \ + --target final \ + ${DOCKER_GPU_BUILD_ARG} \ + --build-arg "GPU_IMAGE=${GPU_IMAGE}" \ + --build-arg "BASE_TARGET=${BASE_TARGET}" \ + -t "${tmp_tag}" \ + $@ \ + -f "${TOPDIR}/.ci/docker/libtorch/Dockerfile" \ + "${TOPDIR}/.ci/docker/" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/linter-cuda/Dockerfile b/.ci/docker/linter-cuda/Dockerfile index 8084bf627124..a8db2f257ac9 100644 --- a/.ci/docker/linter-cuda/Dockerfile +++ b/.ci/docker/linter-cuda/Dockerfile @@ -18,6 +18,7 @@ COPY ./common/install_user.sh install_user.sh RUN bash ./install_user.sh && rm install_user.sh # Install conda and other packages (e.g., numpy, pytest) +<<<<<<< HEAD ARG 
ANACONDA_PYTHON_VERSION ARG CONDA_CMAKE ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION @@ -27,19 +28,43 @@ COPY ./common/install_conda.sh install_conda.sh COPY ./common/common_utils.sh common_utils.sh COPY ./common/install_magma_conda.sh install_magma_conda.sh RUN bash ./install_conda.sh && rm install_conda.sh install_magma_conda.sh common_utils.sh /opt/conda/requirements-ci.txt +======= +ARG PYTHON_VERSION +ARG PIP_CMAKE +# Put venv into the env vars so users don't need to activate it +ENV PATH /var/lib/jenkins/ci_env/bin:$PATH +ENV VIRTUAL_ENV /var/lib/jenkins/ci_env +COPY requirements-ci.txt /opt/requirements-ci.txt +COPY ./common/install_python.sh install_python.sh +RUN bash ./install_python.sh && rm install_python.sh /opt/requirements-ci.txt +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install cuda and cudnn ARG CUDA_VERSION COPY ./common/install_cuda.sh install_cuda.sh +<<<<<<< HEAD RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh +======= +COPY ./common/install_nccl.sh install_nccl.sh +COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/ +COPY ./common/install_cusparselt.sh install_cusparselt.sh +RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu* install_cusparselt.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ENV DESIRED_CUDA ${CUDA_VERSION} ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH # Note that Docker build forbids copying file outside the build context COPY ./common/install_linter.sh install_linter.sh +<<<<<<< HEAD COPY ./common/common_utils.sh common_utils.sh RUN bash ./install_linter.sh RUN rm install_linter.sh common_utils.sh +======= +RUN bash ./install_linter.sh +RUN rm install_linter.sh + +RUN chown -R jenkins:jenkins /var/lib/jenkins/ci_env +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) USER jenkins CMD ["bash"] diff --git a/.ci/docker/linter/Dockerfile b/.ci/docker/linter/Dockerfile index 968918a3617c..607e8839559f 100644 --- a/.ci/docker/linter/Dockerfile +++ b/.ci/docker/linter/Dockerfile @@ -15,6 +15,7 @@ COPY ./common/install_user.sh install_user.sh RUN bash ./install_user.sh && rm install_user.sh # Install conda and other packages (e.g., numpy, pytest) +<<<<<<< HEAD ARG ANACONDA_PYTHON_VERSION ARG CONDA_CMAKE ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION @@ -29,6 +30,19 @@ COPY ./common/install_linter.sh install_linter.sh COPY ./common/common_utils.sh common_utils.sh RUN bash ./install_linter.sh RUN rm install_linter.sh common_utils.sh +======= +ARG PYTHON_VERSION +ENV PATH /var/lib/jenkins/ci_env/bin:$PATH +ENV VIRTUAL_ENV /var/lib/jenkins/ci_env +COPY requirements-ci.txt /opt/requirements-ci.txt +COPY ./common/install_python.sh install_python.sh +RUN bash ./install_python.sh && rm install_python.sh /opt/requirements-ci.txt + +# Note that Docker build forbids copying file outside the build context +COPY ./common/install_linter.sh install_linter.sh +RUN bash ./install_linter.sh +RUN rm install_linter.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) USER jenkins CMD ["bash"] diff --git a/.ci/docker/manywheel/Dockerfile_2_28 b/.ci/docker/manywheel/Dockerfile_2_28 index e63657f391b3..21448786a7b7 100644 --- a/.ci/docker/manywheel/Dockerfile_2_28 +++ 
b/.ci/docker/manywheel/Dockerfile_2_28 @@ -7,8 +7,13 @@ ENV LC_ALL en_US.UTF-8 ENV LANG en_US.UTF-8 ENV LANGUAGE en_US.UTF-8 +<<<<<<< HEAD ARG DEVTOOLSET_VERSION=11 RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel yum-utils gcc-toolset-${DEVTOOLSET_VERSION}-toolchain +======= +ARG DEVTOOLSET_VERSION=13 +RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel yum-utils gcc-toolset-${DEVTOOLSET_VERSION}-gcc gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran gcc-toolset-${DEVTOOLSET_VERSION}-gdb +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH @@ -26,17 +31,31 @@ ADD ./common/install_openssl.sh install_openssl.sh RUN bash ./install_openssl.sh && rm install_openssl.sh +<<<<<<< HEAD # remove unncessary python versions +======= +# remove unnecessary python versions +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2 RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4 RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6 RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6 FROM base as cuda +<<<<<<< HEAD ARG BASE_CUDA_VERSION=11.8 # Install CUDA ADD ./common/install_cuda.sh install_cuda.sh RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh +======= +ARG BASE_CUDA_VERSION=12.6 +# Install CUDA +ADD ./common/install_cuda.sh install_cuda.sh +COPY ./common/install_nccl.sh install_nccl.sh +COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/ +COPY ./common/install_cusparselt.sh install_cusparselt.sh +RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh install_nccl.sh ci_commit_pins/nccl-cu* install_cusparselt.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FROM base as intel # MKL @@ -44,7 +63,11 @@ ADD ./common/install_mkl.sh install_mkl.sh RUN bash ./install_mkl.sh && rm install_mkl.sh FROM base as magma +<<<<<<< HEAD ARG BASE_CUDA_VERSION=10.2 +======= +ARG BASE_CUDA_VERSION=12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install magma ADD ./common/install_magma.sh install_magma.sh RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh @@ -61,7 +84,11 @@ ADD ./common/install_libpng.sh install_libpng.sh RUN bash ./install_libpng.sh && rm install_libpng.sh FROM ${GPU_IMAGE} as common +<<<<<<< HEAD ARG DEVTOOLSET_VERSION=11 +======= +ARG DEVTOOLSET_VERSION=13 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ENV LC_ALL en_US.UTF-8 ENV LANG en_US.UTF-8 ENV LANGUAGE en_US.UTF-8 @@ -84,6 +111,7 @@ RUN yum install -y \ wget \ which \ xz \ +<<<<<<< HEAD gcc-toolset-${DEVTOOLSET_VERSION}-toolchain \ glibc-langpack-en RUN yum install -y \ @@ -91,6 +119,14 @@ RUN yum install -y \ https://ossci-linux.s3.amazonaws.com/epel-release-7-14.noarch.rpm RUN yum swap -y git git236-core +======= + glibc-langpack-en \ + 
gcc-toolset-${DEVTOOLSET_VERSION}-gcc \ + gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ \ + gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran \ + gcc-toolset-${DEVTOOLSET_VERSION}-gdb + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # git236+ would refuse to run git commands in repos owned by other users # Which causes version check to fail, as pytorch repo is bind-mounted into the image # Override this behaviour by treating every folder as safe @@ -101,6 +137,10 @@ ENV SSL_CERT_FILE=/opt/_internal/certs.pem # Install LLVM version COPY --from=openssl /opt/openssl /opt/openssl COPY --from=base /opt/python /opt/python +<<<<<<< HEAD +======= +COPY --from=base /usr/local/lib/ /usr/local/lib/ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) COPY --from=base /opt/_internal /opt/_internal COPY --from=base /usr/local/bin/auditwheel /usr/local/bin/auditwheel COPY --from=intel /opt/intel /opt/intel @@ -114,8 +154,13 @@ COPY --from=libpng /usr/local/lib/pkgconfig /usr/local/ COPY --from=jni /usr/local/include/jni.h /usr/local/include/jni.h FROM common as cpu_final +<<<<<<< HEAD ARG BASE_CUDA_VERSION=11.8 ARG DEVTOOLSET_VERSION=11 +======= +ARG BASE_CUDA_VERSION=12.6 +ARG DEVTOOLSET_VERSION=13 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install Anaconda ADD ./common/install_conda_docker.sh install_conda.sh RUN bash ./install_conda.sh && rm install_conda.sh @@ -154,11 +199,22 @@ ENV ROCM_PATH /opt/rocm # and avoid 3.21.0 cmake+ninja issues with ninja inserting "-Wl,--no-as-needed" in LINK_FLAGS for static linker RUN python3 -m pip install --upgrade pip && \ python3 -mpip install cmake==3.28.4 +<<<<<<< HEAD ADD ./common/install_rocm_drm.sh install_rocm_drm.sh RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh ENV MKLROOT /opt/intel ADD ./common/install_rocm_magma.sh install_rocm_magma.sh RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh +======= +# replace the libdrm in /opt/amdgpu with custom amdgpu.ids lookup path +ADD ./common/install_rocm_drm.sh install_rocm_drm.sh +RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh +# ROCm 6.4 rocm-smi depends on system drm.h header +RUN yum install -y libdrm-devel +ENV MKLROOT /opt/intel +ADD ./common/install_rocm_magma.sh install_rocm_magma.sh +RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} && rm install_rocm_magma.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ADD ./common/install_miopen.sh install_miopen.sh RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh @@ -169,6 +225,10 @@ ENV XPU_DRIVER_TYPE ROLLING RUN python3 -m pip install --upgrade pip && \ python3 -mpip install cmake==3.28.4 ADD ./common/install_xpu.sh install_xpu.sh +<<<<<<< HEAD ENV XPU_VERSION 2025.0 +======= +ENV XPU_VERSION 2025.1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) RUN bash ./install_xpu.sh && rm install_xpu.sh RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd diff --git a/.ci/docker/manywheel/Dockerfile_2_28_aarch64 b/.ci/docker/manywheel/Dockerfile_2_28_aarch64 index 8f5d4c3361ce..35faa9bf2605 100644 --- a/.ci/docker/manywheel/Dockerfile_2_28_aarch64 +++ b/.ci/docker/manywheel/Dockerfile_2_28_aarch64 @@ -1,9 +1,15 
@@ FROM quay.io/pypa/manylinux_2_28_aarch64 as base +<<<<<<< HEAD # Graviton needs GCC 10 or above for the build. GCC12 is the default version in almalinux-8. ARG GCCTOOLSET_VERSION=11 # Language variabes +======= +ARG GCCTOOLSET_VERSION=13 + +# Language variables +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ENV LC_ALL=en_US.UTF-8 ENV LANG=en_US.UTF-8 ENV LANGUAGE=en_US.UTF-8 @@ -36,7 +42,14 @@ RUN yum install -y \ yasm \ zstd \ sudo \ +<<<<<<< HEAD gcc-toolset-${GCCTOOLSET_VERSION}-toolchain +======= + gcc-toolset-${GCCTOOLSET_VERSION}-gcc \ + gcc-toolset-${GCCTOOLSET_VERSION}-gcc-c++ \ + gcc-toolset-${GCCTOOLSET_VERSION}-gcc-gfortran \ + gcc-toolset-${GCCTOOLSET_VERSION}-gdb +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) Install non-default Ninja version ARG NINJA_VERSION @@ -56,12 +69,20 @@ RUN git config --global --add safe.directory "*" FROM base as openblas # Install openblas +<<<<<<< HEAD +======= +ARG OPENBLAS_VERSION +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ADD ./common/install_openblas.sh install_openblas.sh RUN bash ./install_openblas.sh && rm install_openblas.sh FROM base as final +<<<<<<< HEAD # remove unncessary python versions +======= +# remove unnecessary python versions +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2 RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4 RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6 diff --git a/.ci/docker/manywheel/Dockerfile_cuda_aarch64 b/.ci/docker/manywheel/Dockerfile_cuda_aarch64 index dfd766b4dd5a..654e50f11caa 100644 --- a/.ci/docker/manywheel/Dockerfile_cuda_aarch64 +++ b/.ci/docker/manywheel/Dockerfile_cuda_aarch64 @@ -1,7 +1,11 @@ FROM quay.io/pypa/manylinux_2_28_aarch64 as base # Cuda ARM build needs gcc 11 +<<<<<<< HEAD ARG DEVTOOLSET_VERSION=11 +======= +ARG DEVTOOLSET_VERSION=13 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Language variables ENV LC_ALL=en_US.UTF-8 @@ -34,7 +38,14 @@ RUN yum install -y \ zstd \ libgomp \ sudo \ +<<<<<<< HEAD gcc-toolset-${DEVTOOLSET_VERSION}-toolchain +======= + gcc-toolset-${DEVTOOLSET_VERSION}-gcc \ + gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ \ + gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran \ + gcc-toolset-${DEVTOOLSET_VERSION}-gdb +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Ensure the expected devtoolset is used ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH @@ -57,7 +68,11 @@ RUN bash ./install_openssl.sh && rm install_openssl.sh ENV SSL_CERT_FILE=/opt/_internal/certs.pem FROM openssl as final +<<<<<<< HEAD # remove unncessary python versions +======= +# remove unnecessary python versions +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2 RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4 RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6 @@ -66,8 +81,16 @@ RUN rm -rf /opt/python/cp34-cp34m 
/opt/_internal/cpython-3.4.6 FROM base as cuda ARG BASE_CUDA_VERSION # Install CUDA +<<<<<<< HEAD ADD ./common/install_cuda_aarch64.sh install_cuda_aarch64.sh RUN bash ./install_cuda_aarch64.sh ${BASE_CUDA_VERSION} && rm install_cuda_aarch64.sh +======= +ADD ./common/install_cuda.sh install_cuda.sh +COPY ./common/install_nccl.sh install_nccl.sh +COPY ./common/install_cusparselt.sh install_cusparselt.sh +COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/ +RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh install_nccl.sh ci_commit_pins/nccl-cu* install_cusparselt.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FROM base as magma ARG BASE_CUDA_VERSION diff --git a/.ci/docker/manywheel/Dockerfile_s390x b/.ci/docker/manywheel/Dockerfile_s390x index 63a6a67c28ce..a9fc14ad1164 100644 --- a/.ci/docker/manywheel/Dockerfile_s390x +++ b/.ci/docker/manywheel/Dockerfile_s390x @@ -5,7 +5,13 @@ ENV LC_ALL=C.UTF-8 ENV LANG=C.UTF-8 ENV LANGUAGE=C.UTF-8 +<<<<<<< HEAD ARG DEVTOOLSET_VERSION=13 +======= +# there is a bugfix in gcc >= 14 for precompiled headers and s390x vectorization interaction. +# with earlier gcc versions test/inductor/test_cpu_cpp_wrapper.py will fail. +ARG DEVTOOLSET_VERSION=14 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Installed needed OS packages. This is to support all # the binary builds (torch, vision, audio, text, data) RUN yum -y install epel-release @@ -42,6 +48,10 @@ RUN yum install -y \ llvm-devel \ libzstd-devel \ python3.12-devel \ +<<<<<<< HEAD +======= + python3.12-test \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python3.12-setuptools \ python3.12-pip \ python3-virtualenv \ @@ -57,7 +67,12 @@ RUN yum install -y \ libxslt-devel \ libxml2-devel \ openssl-devel \ +<<<<<<< HEAD valgrind +======= + valgrind \ + ninja-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH @@ -101,6 +116,7 @@ CMD ["/bin/bash"] # install test dependencies: # - grpcio requires system openssl, bundled crypto fails to build +<<<<<<< HEAD # - ml_dtypes 0.4.0 requires some fixes provided in later commits to build RUN dnf install -y \ protobuf-devel \ @@ -122,3 +138,37 @@ RUN cd ~ && \ python3 setup.py bdist_wheel && \ pip3 install dist/*.whl && \ rm -rf ml_dtypes +======= +RUN dnf install -y \ + hdf5-devel \ + python3-h5py \ + git + +RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio + +# cmake-3.28.0 from pip for onnxruntime +RUN python3 -mpip install cmake==3.28.0 + +# build onnxruntime 1.21.0 from sources. +# it is not possible to build it from sources using pip, +# so just build it from upstream repository. +# h5py is dependency of onnxruntime_training. +# h5py==3.11.0 builds with hdf5-devel 1.10.5 from repository. +# h5py 3.11.0 doesn't build with numpy >= 2.3.0. +# install newest flatbuffers version first: +# for some reason old version is getting pulled in otherwise. +# packaging package is required for onnxruntime wheel build. 
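Brief aside before the onnxruntime build block that follows: throughout these manylinux Dockerfiles the devtoolset (GCC 13, or 14 on s390x per the comment above) is selected purely by prepending its root to PATH and LD_LIBRARY_PATH. A hedged verification sketch that can be run inside such an image to confirm the toolset compiler is the one being picked up; the check commands are an addition, only the exported paths come from the Dockerfiles:

#!/bin/bash
# Verification sketch for the gcc-toolset selection done via ENV in the Dockerfiles.
set -e
DEVTOOLSET_VERSION="${DEVTOOLSET_VERSION:-14}"   # 14 on s390x, 13 elsewhere in this patch
export PATH="/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH"
export LD_LIBRARY_PATH="/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:${LD_LIBRARY_PATH:-}"
command -v gcc       # expect /opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin/gcc
gcc --version
gfortran --version   # provided by the gcc-toolset-*-gcc-gfortran package installed above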
+RUN pip3 install flatbuffers && \ + pip3 install cython 'pkgconfig>=1.5.5' 'setuptools>=77' 'numpy<2.3.0' && \ + pip3 install --no-build-isolation h5py==3.11.0 && \ + pip3 install packaging && \ + git clone https://github.com/microsoft/onnxruntime && \ + cd onnxruntime && git checkout v1.21.0 && \ + git submodule update --init --recursive && \ + ./build.sh --config Release --parallel 0 --enable_pybind \ + --build_wheel --enable_training --enable_training_apis \ + --enable_training_ops --skip_tests --allow_running_as_root \ + --compile_no_warning_as_error && \ + pip3 install ./build/Linux/Release/dist/onnxruntime_training-*.whl && \ + cd .. && /bin/rm -rf ./onnxruntime +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/manywheel/build.sh b/.ci/docker/manywheel/build.sh index 0601d7605d84..dfab41d4f373 100755 --- a/.ci/docker/manywheel/build.sh +++ b/.ci/docker/manywheel/build.sh @@ -1,7 +1,11 @@ #!/usr/bin/env bash # Script used only in CD pipeline +<<<<<<< HEAD set -eou pipefail +======= +set -exou pipefail +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TOPDIR=$(git rev-parse --show-toplevel) @@ -9,6 +13,7 @@ image="$1" shift if [ -z "${image}" ]; then +<<<<<<< HEAD echo "Usage: $0 IMAGE" exit 1 fi @@ -54,17 +59,61 @@ case ${GPU_ARCH_TYPE} in cpu-cxx11-abi) TARGET=final DOCKER_TAG=cpu-cxx11-abi +======= + echo "Usage: $0 IMAGE:ARCHTAG" + exit 1 +fi + +# Go from imagename:tag to tag +DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}') + +GPU_ARCH_VERSION="" +if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then + # extract cuda version from image name. e.g. manylinux2_28-builder:cuda12.8 returns 12.8 + GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}') +elif [[ "${DOCKER_TAG_PREFIX}" == rocm* ]]; then + # extract rocm version from image name. e.g. 
manylinux2_28-builder:rocm6.2.4 returns 6.2.4 + GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'rocm' '{print $2}') +fi + +MANY_LINUX_VERSION=${MANY_LINUX_VERSION:-} +DOCKERFILE_SUFFIX=${DOCKERFILE_SUFFIX:-} +OPENBLAS_VERSION=${OPENBLAS_VERSION:-} + +case ${image} in + manylinux2_28-builder:cpu) + TARGET=cpu_final + GPU_IMAGE=amd64/almalinux:8 + DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13" + MANY_LINUX_VERSION="2_28" + ;; + manylinux2_28_aarch64-builder:cpu-aarch64) + TARGET=final + GPU_IMAGE=arm64v8/almalinux:8 + DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13 --build-arg NINJA_VERSION=1.12.1" + MANY_LINUX_VERSION="2_28_aarch64" + OPENBLAS_VERSION="v0.3.30" + ;; + manylinuxcxx11-abi-builder:cpu-cxx11-abi) + TARGET=final +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_IMAGE="" DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9" MANY_LINUX_VERSION="cxx11-abi" ;; +<<<<<<< HEAD cpu-s390x) TARGET=final DOCKER_TAG=cpu-s390x +======= + manylinuxs390x-builder:cpu-s390x) + TARGET=final +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_IMAGE=s390x/almalinux:8 DOCKER_GPU_BUILD_ARG="" MANY_LINUX_VERSION="s390x" ;; +<<<<<<< HEAD cuda) TARGET=cuda_final DOCKER_TAG=cuda${GPU_ARCH_VERSION} @@ -75,10 +124,15 @@ case ${GPU_ARCH_TYPE} in cuda-manylinux_2_28) TARGET=cuda_final DOCKER_TAG=cuda${GPU_ARCH_VERSION} +======= + manylinux2_28-builder:cuda11*) + TARGET=cuda_final +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_IMAGE=amd64/almalinux:8 DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11" MANY_LINUX_VERSION="2_28" ;; +<<<<<<< HEAD cuda-aarch64) TARGET=cuda_final DOCKER_TAG=cuda${GPU_ARCH_VERSION} @@ -103,16 +157,46 @@ case ${GPU_ARCH_TYPE} in xpu) TARGET=xpu_final DOCKER_TAG=xpu +======= + manylinux2_28-builder:cuda12*) + TARGET=cuda_final + GPU_IMAGE=amd64/almalinux:8 + DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13" + MANY_LINUX_VERSION="2_28" + ;; + manylinuxaarch64-builder:cuda*) + TARGET=cuda_final + GPU_IMAGE=amd64/almalinux:8 + DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13" + MANY_LINUX_VERSION="aarch64" + DOCKERFILE_SUFFIX="_cuda_aarch64" + ;; + manylinux2_28-builder:rocm*) + TARGET=rocm_final + MANY_LINUX_VERSION="2_28" + DEVTOOLSET_VERSION="11" + GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete + PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" + DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}" + ;; + manylinux2_28-builder:xpu) + TARGET=xpu_final +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_IMAGE=amd64/almalinux:8 DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11" MANY_LINUX_VERSION="2_28" ;; *) +<<<<<<< HEAD echo "ERROR: Unrecognized GPU_ARCH_TYPE: ${GPU_ARCH_TYPE}" +======= + echo "ERROR: Unrecognized image name: ${image}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) exit 1 ;; esac 
+<<<<<<< HEAD IMAGES='' if [[ -n ${MANY_LINUX_VERSION} && -z ${DOCKERFILE_SUFFIX} ]]; then @@ -158,3 +242,28 @@ if [[ "${WITH_PUSH}" == true ]]; then fi ) fi +======= +if [[ -n ${MANY_LINUX_VERSION} && -z ${DOCKERFILE_SUFFIX} ]]; then + DOCKERFILE_SUFFIX=_${MANY_LINUX_VERSION} +fi +# Only activate this if in CI +if [ "$(uname -m)" != "s390x" ] && [ -v CI ]; then + # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712 + # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023. + sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service + sudo systemctl daemon-reload + sudo systemctl restart docker +fi + +tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]') + +DOCKER_BUILDKIT=1 docker build \ + ${DOCKER_GPU_BUILD_ARG} \ + --build-arg "GPU_IMAGE=${GPU_IMAGE}" \ + --build-arg "OPENBLAS_VERSION=${OPENBLAS_VERSION}" \ + --target "${TARGET}" \ + -t "${tmp_tag}" \ + $@ \ + -f "${TOPDIR}/.ci/docker/manywheel/Dockerfile${DOCKERFILE_SUFFIX}" \ + "${TOPDIR}/.ci/docker/" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/manywheel/build_scripts/build.sh b/.ci/docker/manywheel/build_scripts/build.sh index e2cb1c7f27cd..75610a6fa94e 100644 --- a/.ci/docker/manywheel/build_scripts/build.sh +++ b/.ci/docker/manywheel/build_scripts/build.sh @@ -97,7 +97,11 @@ find /opt/_internal -type f -print0 \ | xargs -0 -n1 strip --strip-unneeded 2>/dev/null || true # We do not need the Python test suites, or indeed the precompiled .pyc and # .pyo files. Partially cribbed from: +<<<<<<< HEAD # https://github.com/docker-library/python/blob/master/3.4/slim/Dockerfile +======= +# https://github.com/docker-library/python/blob/master/3.4/slim/Dockerfile # @lint-ignore +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) find /opt/_internal \ \( -type d -a -name test -o -name tests \) \ -o \( -type f -a -name '*.pyc' -o -name '*.pyo' \) \ diff --git a/.ci/docker/manywheel/build_scripts/build_utils.sh b/.ci/docker/manywheel/build_scripts/build_utils.sh index cec871cac4f6..fa7631fb76f6 100755 --- a/.ci/docker/manywheel/build_scripts/build_utils.sh +++ b/.ci/docker/manywheel/build_scripts/build_utils.sh @@ -2,7 +2,11 @@ # Helper utilities for build # Script used only in CD pipeline +<<<<<<< HEAD OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source/old/1.1.1/ +======= +OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source/old/1.1.1/ # @lint-ignore +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CURL_DOWNLOAD_URL=https://curl.se/download AUTOCONF_DOWNLOAD_URL=https://ftp.gnu.org/gnu/autoconf diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index 56abad7aafeb..a1d6751cdee4 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -16,7 +16,11 @@ click #test that import: coremltools==5.0b5 ; python_version < "3.12" +<<<<<<< HEAD coremltools==7.2 ; python_version == "3.12" +======= +coremltools==8.3 ; python_version == "3.12" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #Description: Apple framework for ML integration #Pinned versions: 5.0b5 #test that import: @@ -42,9 +46,15 @@ fbscribelogger==0.1.7 #Pinned versions: 
0.1.6 #test that import: +<<<<<<< HEAD flatbuffers==2.0 #Description: cross platform serialization library #Pinned versions: 2.0 +======= +flatbuffers==24.12.23 +#Description: cross platform serialization library +#Pinned versions: 24.12.23 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #test that import: hypothesis==5.35.1 @@ -92,10 +102,17 @@ librosa==0.10.2 ; python_version == "3.12" #Pinned versions: #test that import: +<<<<<<< HEAD mypy==1.14.0 # Pin MyPy version because new errors are likely to appear with each release #Description: linter #Pinned versions: 1.14.0 +======= +mypy==1.16.0 +# Pin MyPy version because new errors are likely to appear with each release +#Description: linter +#Pinned versions: 1.16.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #test that import: test_typing.py, test_type_hints.py networkx==2.8.8 @@ -104,10 +121,17 @@ networkx==2.8.8 #Pinned versions: 2.8.8 #test that import: functorch +<<<<<<< HEAD #ninja #Description: build system. Note that it install from #here breaks things so it is commented out #Pinned versions: 1.10.0.post1 +======= +ninja==1.11.1.3 +#Description: build system. Used in some tests. Used in build to generate build +#time tracing information +#Pinned versions: 1.11.1.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py numba==0.60.0 ; python_version == "3.9" @@ -258,11 +282,14 @@ scipy==1.14.1 ; python_version > "3.9" #Pinned versions: #test that import: +<<<<<<< HEAD tlparse==0.3.30 #Description: parse logs produced by torch.compile #Pinned versions: #test that import: dynamo/test_structured_trace.py +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # needed by torchgen utils typing-extensions>=4.10.0 #Description: type hints for python @@ -340,7 +367,11 @@ onnx==1.18.0 ; python_version == "3.13" #Pinned versions: #test that import: +<<<<<<< HEAD onnxscript==0.2.2 +======= +onnxscript==0.3.1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal #Pinned versions: #test that import: @@ -354,7 +385,11 @@ parameterized==0.8.1 #Pinned versions: 1.24.0 #test that import: test_sac_estimator.py +<<<<<<< HEAD pwlf==2.2.1 ; python_version >= "3.8" +======= +pwlf==2.2.1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #Description: required for testing torch/distributed/_tools/sac_estimator.py #Pinned versions: 2.2.1 #test that import: test_sac_estimator.py @@ -366,10 +401,16 @@ PyYAML pyzstd setuptools +<<<<<<< HEAD ninja==1.11.1 ; platform_machine == "aarch64" scons==4.5.2 ; platform_machine == "aarch64" pulp==2.9.0 ; python_version >= "3.8" +======= +scons==4.5.2 ; platform_machine == "aarch64" + +pulp==2.9.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #Description: required for testing ilp formulaiton under torch/distributed/_tools #Pinned versions: 2.9.0 #test that import: test_sac_ilp.py @@ -378,3 +419,16 @@ 
dataclasses_json==0.6.7 #Description: required for data pipeline and scripts under tools/stats #Pinned versions: 0.6.7 #test that import: +<<<<<<< HEAD +======= + +cmake==4.0.0 +#Description: required for building + +tlparse==0.3.30 +#Description: required for log parsing + +cuda-bindings>=12.0,<13.0 +#Description: required for testing CUDAGraph::raw_cuda_graph(). See https://nvidia.github.io/cuda-python/cuda-bindings/latest/support.html for how this version was chosen. Note "Any fix in the latest bindings would be backported to the prior major version" means that only the newest version of cuda-bindings will get fixes. Depending on the latest version of 12.x is okay because all 12.y versions will be supported via "CUDA minor version compatibility". Pytorch builds against 13.z versions of cuda toolkit work with 12.x versions of cuda-bindings as well because newer drivers work with old toolkits. +#test that import: test_cuda.py +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/requirements-docs.txt b/.ci/docker/requirements-docs.txt index dcbdb42ee64c..3524240ef2ea 100644 --- a/.ci/docker/requirements-docs.txt +++ b/.ci/docker/requirements-docs.txt @@ -1,15 +1,35 @@ sphinx==5.3.0 #Description: This is used to generate PyTorch docs #Pinned versions: 5.3.0 +<<<<<<< HEAD -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering # but it doesn't seem to work and hangs around idly. The initial thought is probably # something related to Docker setup. We can investigate this later +======= +-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2 + +# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering +# but it doesn't seem to work and hangs around idly. The initial thought that it is probably +# something related to Docker setup. We can investigate this later. 
+ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sphinxcontrib.katex==0.8.6 #Description: This is used to generate PyTorch docs #Pinned versions: 0.8.6 +<<<<<<< HEAD +======= +sphinxext-opengraph==0.9.1 +#Description: This is used to generate PyTorch docs +#Pinned versions: 0.9.1 + +sphinx_sitemap==2.6.0 +#Description: This is used to generate sitemap for PyTorch docs +#Pinned versions: 2.6.0 + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) matplotlib==3.5.3 #Description: This is used to generate PyTorch docs #Pinned versions: 3.5.3 @@ -46,5 +66,10 @@ myst-nb==0.17.2 # The following are required to build torch.distributed.elastic.rendezvous.etcd* docs python-etcd==0.4.5 sphinx-copybutton==0.5.0 +<<<<<<< HEAD sphinx-panels==0.4.1 +======= +sphinx-design==0.4.0 +sphinxcontrib-mermaid==1.0.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) myst-parser==0.18.1 diff --git a/.ci/docker/triton_version.txt b/.ci/docker/triton_version.txt index bea438e9ade7..346f470c5c21 100644 --- a/.ci/docker/triton_version.txt +++ b/.ci/docker/triton_version.txt @@ -1 +1,5 @@ +<<<<<<< HEAD 3.3.1 +======= +3.4.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/triton_xpu_version.txt b/.ci/docker/triton_xpu_version.txt new file mode 100644 index 000000000000..18091983f59d --- /dev/null +++ b/.ci/docker/triton_xpu_version.txt @@ -0,0 +1 @@ +3.4.0 diff --git a/.ci/docker/ubuntu-rocm/Dockerfile b/.ci/docker/ubuntu-rocm/Dockerfile index a041ff3a3671..0f1986d91455 100644 --- a/.ci/docker/ubuntu-rocm/Dockerfile +++ b/.ci/docker/ubuntu-rocm/Dockerfile @@ -28,7 +28,10 @@ ARG ANACONDA_PYTHON_VERSION ARG BUILD_ENVIRONMENT ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH +<<<<<<< HEAD ARG CONDA_CMAKE +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) COPY requirements-ci.txt /opt/conda/requirements-ci.txt COPY ./common/install_conda.sh install_conda.sh COPY ./common/common_utils.sh common_utils.sh @@ -44,6 +47,7 @@ ARG CLANG_VERSION COPY ./common/install_clang.sh install_clang.sh RUN bash ./install_clang.sh && rm install_clang.sh +<<<<<<< HEAD # (optional) Install protobuf for ONNX ARG PROTOBUF COPY ./common/install_protobuf.sh install_protobuf.sh @@ -58,6 +62,8 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi RUN rm install_db.sh ENV INSTALLED_DB ${DB} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) Install vision packages like OpenCV ARG VISION COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./ @@ -71,7 +77,11 @@ COPY ./common/install_rocm.sh install_rocm.sh RUN bash ./install_rocm.sh RUN rm install_rocm.sh COPY ./common/install_rocm_magma.sh install_rocm_magma.sh +<<<<<<< HEAD RUN bash ./install_rocm_magma.sh +======= +RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) RUN rm install_rocm_magma.sh ADD ./common/install_miopen.sh install_miopen.sh RUN bash 
./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh @@ -116,12 +126,15 @@ COPY ci_commit_pins/timm.txt timm.txt RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt +<<<<<<< HEAD # (optional) Install non-default CMake version ARG CMAKE_VERSION COPY ./common/install_cmake.sh install_cmake.sh RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi RUN rm install_cmake.sh +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) Install non-default Ninja version ARG NINJA_VERSION COPY ./common/install_ninja.sh install_ninja.sh diff --git a/.ci/docker/ubuntu-xpu/Dockerfile b/.ci/docker/ubuntu-xpu/Dockerfile index 41f690c4ab38..d1fc652cf71d 100644 --- a/.ci/docker/ubuntu-xpu/Dockerfile +++ b/.ci/docker/ubuntu-xpu/Dockerfile @@ -28,7 +28,10 @@ RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh # Install conda and other packages (e.g., numpy, pytest) ARG ANACONDA_PYTHON_VERSION +<<<<<<< HEAD ARG CONDA_CMAKE +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ARG DOCS ARG BUILD_ENVIRONMENT ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION @@ -73,6 +76,7 @@ ARG TRITON COPY ./common/install_triton.sh install_triton.sh COPY ./common/common_utils.sh common_utils.sh COPY ci_commit_pins/triton-xpu.txt triton-xpu.txt +<<<<<<< HEAD COPY triton_version.txt triton_version.txt RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi RUN rm install_triton.sh common_utils.sh triton-xpu.txt triton_version.txt @@ -84,6 +88,12 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi RUN rm install_db.sh ENV INSTALLED_DB ${DB} +======= +COPY triton_xpu_version.txt triton_version.txt +RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi +RUN rm install_triton.sh common_utils.sh triton-xpu.txt triton_version.txt + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) Install vision packages like OpenCV ARG VISION COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./ @@ -91,12 +101,15 @@ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi RUN rm install_vision.sh cache_vision_models.sh common_utils.sh ENV INSTALLED_VISION ${VISION} +<<<<<<< HEAD # (optional) Install non-default CMake version ARG CMAKE_VERSION COPY ./common/install_cmake.sh install_cmake.sh RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi RUN rm install_cmake.sh +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) Install non-default Ninja version ARG NINJA_VERSION COPY ./common/install_ninja.sh install_ninja.sh diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 44bc3b8f2c25..c00d2e38248f 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -1,6 +1,10 @@ ARG UBUNTU_VERSION +<<<<<<< HEAD FROM ubuntu:${UBUNTU_VERSION} +======= +FROM ubuntu:${UBUNTU_VERSION} as base +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ARG UBUNTU_VERSION @@ -28,7 +32,10 @@ RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh # Install conda and other packages (e.g., 
numpy, pytest) ARG ANACONDA_PYTHON_VERSION +<<<<<<< HEAD ARG CONDA_CMAKE +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ARG DOCS ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH @@ -52,9 +59,23 @@ RUN bash ./install_lcov.sh && rm install_lcov.sh # Install cuda and cudnn ARG CUDA_VERSION COPY ./common/install_cuda.sh install_cuda.sh +<<<<<<< HEAD RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh ENV DESIRED_CUDA ${CUDA_VERSION} ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH +======= +COPY ./common/install_nccl.sh install_nccl.sh +COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/ +COPY ./common/install_cusparselt.sh install_cusparselt.sh +RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu* install_cusparselt.sh +ENV DESIRED_CUDA ${CUDA_VERSION} +ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH +# No effect if cuda not installed +ENV USE_SYSTEM_NCCL=1 +ENV NCCL_INCLUDE_DIR="/usr/local/cuda/include/" +ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/" + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) Install UCC ARG UCX_COMMIT @@ -67,6 +88,7 @@ ADD ./common/install_ucc.sh install_ucc.sh RUN if [ -n "${UCX_COMMIT}" ] && [ -n "${UCC_COMMIT}" ]; then bash ./install_ucc.sh; fi RUN rm install_ucc.sh +<<<<<<< HEAD # (optional) Install protobuf for ONNX ARG PROTOBUF COPY ./common/install_protobuf.sh install_protobuf.sh @@ -81,6 +103,8 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi RUN rm install_db.sh ENV INSTALLED_DB ${DB} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) Install vision packages like OpenCV ARG VISION COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./ @@ -88,6 +112,7 @@ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi RUN rm install_vision.sh cache_vision_models.sh common_utils.sh ENV INSTALLED_VISION ${VISION} +<<<<<<< HEAD # (optional) Install Vulkan SDK ARG VULKAN_SDK_VERSION COPY ./common/install_vulkan_sdk.sh install_vulkan_sdk.sh @@ -106,6 +131,8 @@ COPY ./common/install_cmake.sh install_cmake.sh RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi RUN rm install_cmake.sh +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) Install non-default Ninja version ARG NINJA_VERSION COPY ./common/install_ninja.sh install_ninja.sh @@ -127,6 +154,7 @@ RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_d RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt ARG TRITON +<<<<<<< HEAD # Install triton, this needs to be done before sccache because the latter will # try to reach out to S3, which docker build runners don't have access COPY ./common/install_triton.sh install_triton.sh @@ -141,6 +169,23 @@ COPY ./common/common_utils.sh common_utils.sh COPY ci_commit_pins/triton-cpu.txt triton-cpu.txt RUN if [ -n "${TRITON_CPU}" ]; then bash ./install_triton.sh; fi RUN rm install_triton.sh common_utils.sh triton-cpu.txt +======= +ARG TRITON_CPU + +# Create a separate stage for building Triton and Triton-CPU. 
install_triton
+# will check for the presence of env vars
+FROM base as triton-builder
+COPY ./common/install_triton.sh install_triton.sh
+COPY ./common/common_utils.sh common_utils.sh
+COPY ci_commit_pins/triton.txt triton.txt
+COPY ci_commit_pins/triton-cpu.txt triton-cpu.txt
+RUN bash ./install_triton.sh
+
+FROM base as final
+COPY --from=triton-builder /opt/triton /opt/triton
+RUN if [ -n "${TRITON}" ] || [ -n "${TRITON_CPU}" ]; then pip install /opt/triton/*.whl; chown -R jenkins:jenkins /opt/conda; fi
+RUN rm -rf /opt/triton
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))

ARG EXECUTORCH
# Build and install executorch
@@ -171,6 +216,15 @@ RUN if [ -n "${ACL}" ]; then bash ./install_acl.sh; fi
RUN rm install_acl.sh
ENV INSTALLED_ACL ${ACL}

+<<<<<<< HEAD
+=======
+ARG OPENBLAS
+COPY ./common/install_openblas.sh install_openblas.sh
+RUN if [ -n "${OPENBLAS}" ]; then bash ./install_openblas.sh; fi
+RUN rm install_openblas.sh
+ENV INSTALLED_OPENBLAS ${OPENBLAS}
+
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
# Install ccache/sccache (do this last, so we get priority in PATH)
ARG SKIP_SCCACHE_INSTALL
COPY ./common/install_cache.sh install_cache.sh
diff --git a/.ci/magma-rocm/.gitignore b/.ci/magma-rocm/.gitignore
new file mode 100644
index 000000000000..6c64316195bc
--- /dev/null
+++ b/.ci/magma-rocm/.gitignore
@@ -0,0 +1,2 @@
+output/
+magma-rocm*/
diff --git a/.ci/magma-rocm/Makefile b/.ci/magma-rocm/Makefile
new file mode 100644
index 000000000000..5f63da87bc4d
--- /dev/null
+++ b/.ci/magma-rocm/Makefile
@@ -0,0 +1,35 @@
+SHELL=/usr/bin/env bash
+
+DOCKER_CMD ?= docker
+DESIRED_ROCM ?= 6.4
+DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
+PACKAGE_NAME = magma-rocm
+# inherit this from underlying docker image, do not pass this env var to docker
+#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201
+
+DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
+	-v $(shell git rev-parse --show-toplevel)/.ci:/builder \
+	-w /builder \
+	-e PACKAGE_NAME=${PACKAGE_NAME}${DESIRED_ROCM_SHORT} \
+	-e DESIRED_ROCM=${DESIRED_ROCM} \
+	"pytorch/almalinux-builder:rocm${DESIRED_ROCM}" \
+	magma-rocm/build_magma.sh
+
+.PHONY: all
+all: magma-rocm64
+all: magma-rocm63
+
+.PHONY:
+clean:
+	$(RM) -r magma-*
+	$(RM) -r output
+
+.PHONY: magma-rocm64
+magma-rocm64: DESIRED_ROCM := 6.4
+magma-rocm64:
+	$(DOCKER_RUN)
+
+.PHONY: magma-rocm63
+magma-rocm63: DESIRED_ROCM := 6.3
+magma-rocm63:
+	$(DOCKER_RUN)
diff --git a/.ci/magma-rocm/README.md b/.ci/magma-rocm/README.md
new file mode 100644
index 000000000000..cfc3cd3ab163
--- /dev/null
+++ b/.ci/magma-rocm/README.md
@@ -0,0 +1,48 @@
+# Magma ROCm
+
+This folder contains the scripts and configurations to build libmagma.so, linked for various versions of ROCm.
+
+## Building
+
+Look in the `Makefile` for available targets to build. To build any target, for example `magma-rocm63`, run
+
+```
+# Using `docker`
+make magma-rocm63
+
+# Using `podman`
+DOCKER_CMD=podman make magma-rocm63
+```
+
+This spawns a container from the `pytorch/almalinux-builder:rocm${DESIRED_ROCM}` docker image (see `DOCKER_RUN` in the `Makefile`), which has the required `devtoolset` and ROCm versions installed.
+Within the container, it runs `build_magma.sh` with the correct environment variables set, which packages the necessary files
+into a tarball, with the following structure:
+
+```
+.
+├── include # header files
+├── lib # libmagma.so
+├── info
+│ ├── licenses # license file
+│ └── recipe # build script
+```
+
+More specifically, `build_magma.sh` copies over the relevant files from the `package_files` directory depending on the ROCm version.
+The resulting binaries are placed in the `output` folder.
+
+
+## Pushing
+
+Packages can be uploaded to an S3 bucket using:
+
+```
+aws s3 cp output/*/magma-rocm*.bz2
+```
+
+If you do not have upload permissions, please ping @seemethere or @soumith to gain access.
+
+## New versions
+
+New ROCm versions can be added by creating a new make target with the next desired version. For ROCm version N.n, the target should be named `magma-rocmNn`.
+
+Make sure to edit the appropriate environment variables (e.g., DESIRED_ROCM) in the `Makefile` accordingly. Remember also to check `build_magma.sh` to ensure the logic for copying over the files remains correct.
diff --git a/.ci/magma-rocm/build_magma.sh b/.ci/magma-rocm/build_magma.sh
new file mode 100755
index 000000000000..4acb3fb0dc3b
--- /dev/null
+++ b/.ci/magma-rocm/build_magma.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+
+set -eou pipefail
+
+# Environment variables
+# The script expects DESIRED_ROCM and PACKAGE_NAME to be set
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
+# Version 2.7.2 + ROCm related updates
+MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6
+
+# Folders for the build
+PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata
+PACKAGE_DIR=${ROOT_DIR}/magma-rocm/${PACKAGE_NAME} # build workspace
+PACKAGE_OUTPUT=${ROOT_DIR}/magma-rocm/output # where tarballs are stored
+PACKAGE_BUILD=${PACKAGE_DIR} # where the content of the tarball is prepared
+PACKAGE_RECIPE=${PACKAGE_BUILD}/info/recipe
+PACKAGE_LICENSE=${PACKAGE_BUILD}/info/licenses
+mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RECIPE} ${PACKAGE_LICENSE}
+
+# Fetch magma sources and verify checksum
+pushd ${PACKAGE_DIR}
+git clone https://bitbucket.org/icl/magma.git
+pushd magma
+git checkout ${MAGMA_VERSION}
+popd
+popd
+
+# build
+pushd ${PACKAGE_DIR}/magma
+# The build.sh script expects to be executed from the sources root folder
+INSTALL_DIR=${PACKAGE_BUILD} ${PACKAGE_FILES}/build.sh
+popd
+
+# Package recipe, license and tarball
+# Folder and package name are backward compatible for the build workflow
+cp ${PACKAGE_FILES}/build.sh ${PACKAGE_RECIPE}/build.sh
+cp ${PACKAGE_DIR}/magma/COPYRIGHT ${PACKAGE_LICENSE}/COPYRIGHT
+pushd ${PACKAGE_BUILD}
+tar cjf ${PACKAGE_OUTPUT}/linux-64/${PACKAGE_NAME}-${MAGMA_VERSION}-1.tar.bz2 include lib info
+echo Built in ${PACKAGE_OUTPUT}/linux-64/${PACKAGE_NAME}-${MAGMA_VERSION}-1.tar.bz2
+popd
diff --git a/.ci/magma-rocm/package_files/build.sh b/.ci/magma-rocm/package_files/build.sh
new file mode 100755
index 000000000000..d0f0911db525
--- /dev/null
+++ b/.ci/magma-rocm/package_files/build.sh
@@ -0,0 +1,38 @@
+# Magma build scripts need `python`
+ln -sf /usr/bin/python3 /usr/bin/python
+
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+case "$ID" in
+  almalinux)
+    yum install -y gcc-gfortran
+    ;;
+  *)
+    echo "No preinstalls to build magma..."
+ ;; +esac + +MKLROOT=${MKLROOT:-/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION} + +cp make.inc-examples/make.inc.hip-gcc-mkl make.inc +echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc +if [[ -f "${MKLROOT}/lib/libmkl_core.a" ]]; then + echo 'LIB = -Wl,--start-group -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core -Wl,--end-group -lpthread -lstdc++ -lm -lgomp -lhipblas -lhipsparse' >> make.inc +fi +echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib -ldl' >> make.inc +echo 'DEVCCFLAGS += --gpu-max-threads-per-block=256' >> make.inc +export PATH="${PATH}:/opt/rocm/bin" +if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then + amdgpu_targets=`echo $PYTORCH_ROCM_ARCH | sed 's/;/ /g'` +else + amdgpu_targets=`rocm_agent_enumerator | grep -v gfx000 | sort -u | xargs` +fi +for arch in $amdgpu_targets; do + echo "DEVCCFLAGS += --offload-arch=$arch" >> make.inc +done +# hipcc with openmp flag may cause isnan() on __device__ not to be found; depending on context, compiler may attempt to match with host definition +sed -i 's/^FOPENMP/#FOPENMP/g' make.inc +make -f make.gen.hipMAGMA -j $(nproc) +LANG=C.UTF-8 make lib/libmagma.so -j $(nproc) MKLROOT="${MKLROOT}" +make testing/testing_dgemm -j $(nproc) MKLROOT="${MKLROOT}" +cp -R lib ${INSTALL_DIR} +cp -R include ${INSTALL_DIR} diff --git a/.ci/magma/Makefile b/.ci/magma/Makefile index 17c62b71d4e2..bd8587b94162 100644 --- a/.ci/magma/Makefile +++ b/.ci/magma/Makefile @@ -1,7 +1,11 @@ SHELL=/usr/bin/env bash DOCKER_CMD ?= docker +<<<<<<< HEAD DESIRED_CUDA ?= 11.8 +======= +DESIRED_CUDA ?= 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_CUDA_SHORT = $(subst .,,$(DESIRED_CUDA)) PACKAGE_NAME = magma-cuda CUDA_ARCH_LIST ?= -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 @@ -12,6 +16,7 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \ -e PACKAGE_NAME=${PACKAGE_NAME}${DESIRED_CUDA_SHORT} \ -e DESIRED_CUDA=${DESIRED_CUDA} \ -e CUDA_ARCH_LIST="${CUDA_ARCH_LIST}" \ +<<<<<<< HEAD "pytorch/manylinux2_28-builder:cuda${DESIRED_CUDA}-main" \ magma/build_magma.sh @@ -20,12 +25,30 @@ all: magma-cuda128 all: magma-cuda126 all: magma-cuda124 all: magma-cuda118 +======= + "pytorch/almalinux-builder:cuda${DESIRED_CUDA}-main" \ + magma/build_magma.sh + +.PHONY: all +all: magma-cuda129 +all: magma-cuda128 +all: magma-cuda126 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) .PHONY: clean: $(RM) -r magma-* $(RM) -r output +<<<<<<< HEAD +======= +.PHONY: magma-cuda129 +magma-cuda129: DESIRED_CUDA := 12.9 +magma-cuda129: CUDA_ARCH_LIST += -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 +magma-cuda129: + $(DOCKER_RUN) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) .PHONY: magma-cuda128 magma-cuda128: DESIRED_CUDA := 12.8 magma-cuda128: CUDA_ARCH_LIST += -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 @@ -36,6 +59,7 @@ magma-cuda128: magma-cuda126: DESIRED_CUDA := 12.6 magma-cuda126: $(DOCKER_RUN) +<<<<<<< HEAD .PHONY: magma-cuda124 magma-cuda124: DESIRED_CUDA := 12.4 @@ -47,3 +71,5 @@ magma-cuda118: DESIRED_CUDA := 11.8 magma-cuda118: CUDA_ARCH_LIST += -gencode 
arch=compute_37,code=sm_37 magma-cuda118: $(DOCKER_RUN) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/manywheel/build_common.sh b/.ci/manywheel/build_common.sh index 025b19a24a98..daaad261e671 100644 --- a/.ci/manywheel/build_common.sh +++ b/.ci/manywheel/build_common.sh @@ -18,12 +18,19 @@ retry () { $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) } +<<<<<<< HEAD PLATFORM="manylinux2014_x86_64" # TODO move this into the Docker images OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release) if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then retry yum install -q -y zip openssl elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then +======= +PLATFORM="" +# TODO move this into the Docker images +OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release) +if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) retry yum install -q -y zip openssl PLATFORM="manylinux_2_28_x86_64" elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then @@ -33,9 +40,17 @@ elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then # Comment out nvidia repositories to prevent them from getting apt-get updated, see https://github.com/pytorch/pytorch/issues/74968 # shellcheck disable=SC2046 sed -i 's/.*nvidia.*/# &/' $(find /etc/apt/ -type f -name "*.list") +<<<<<<< HEAD retry apt-get update retry apt-get -y install zip openssl +======= + retry apt-get update + retry apt-get -y install zip openssl +else + echo "Unknown OS: '$OS_NAME'" + exit 1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi # We use the package name to test the package by passing this to 'pip install' @@ -79,8 +94,11 @@ if [[ -e /opt/openssl ]]; then export CMAKE_INCLUDE_PATH="/opt/openssl/include":$CMAKE_INCLUDE_PATH fi +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) mkdir -p /tmp/$WHEELHOUSE_DIR export PATCHELF_BIN=/usr/local/bin/patchelf @@ -99,6 +117,10 @@ if [[ -z "$PYTORCH_ROOT" ]]; then exit 1 fi pushd "$PYTORCH_ROOT" +<<<<<<< HEAD +======= +retry pip install -q cmake +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python setup.py clean retry pip install -qr requirements.txt case ${DESIRED_PYTHON} in @@ -111,12 +133,15 @@ case ${DESIRED_PYTHON} in ;; esac +<<<<<<< HEAD if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then export _GLIBCXX_USE_CXX11_ABI=1 else export _GLIBCXX_USE_CXX11_ABI=0 fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then echo "Calling build_amd.py at $(date)" python tools/amd_build/build_amd.py @@ -158,7 +183,11 @@ if [[ "$USE_SPLIT_BUILD" == "true" ]]; then BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 \ BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \ USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \ +<<<<<<< HEAD python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR --cmake +======= + CMAKE_FRESH=1 python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "Finished setup.py bdist_wheel 
for split build (BUILD_PYTHON_ONLY)" else time CMAKE_ARGS=${CMAKE_ARGS[@]} \ @@ -209,12 +238,15 @@ if [[ -n "$BUILD_PYTHONLESS" ]]; then mkdir -p /tmp/$LIBTORCH_HOUSE_DIR +<<<<<<< HEAD if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then LIBTORCH_ABI="cxx11-abi-" else LIBTORCH_ABI= fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) zip -rq /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip libtorch cp /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip \ /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-latest.zip diff --git a/.ci/manywheel/build_cuda.sh b/.ci/manywheel/build_cuda.sh index 8f8b37b46e59..dc07294b1450 100644 --- a/.ci/manywheel/build_cuda.sh +++ b/.ci/manywheel/build_cuda.sh @@ -15,6 +15,12 @@ export INSTALL_TEST=0 # dont install test binaries into site-packages export USE_CUPTI_SO=0 export USE_CUSPARSELT=${USE_CUSPARSELT:-1} # Enable if not disabled by libtorch build export USE_CUFILE=${USE_CUFILE:-1} +<<<<<<< HEAD +======= +export USE_SYSTEM_NCCL=1 +export NCCL_INCLUDE_DIR="/usr/local/cuda/include/" +export NCCL_LIB_DIR="/usr/local/cuda/lib64/" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Keep an array of cmake variables to add to if [[ -z "$CMAKE_ARGS" ]]; then @@ -36,10 +42,15 @@ if [[ -n "$DESIRED_CUDA" ]]; then if [[ ${DESIRED_CUDA} =~ ^[0-9]+\.[0-9]+$ ]]; then CUDA_VERSION=${DESIRED_CUDA} else +<<<<<<< HEAD # cu90, cu92, cu100, cu101 if [[ ${#DESIRED_CUDA} -eq 4 ]]; then CUDA_VERSION="${DESIRED_CUDA:2:1}.${DESIRED_CUDA:3:1}" elif [[ ${#DESIRED_CUDA} -eq 5 ]]; then +======= + # cu126, cu128 etc... 
+ if [[ ${#DESIRED_CUDA} -eq 5 ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CUDA_VERSION="${DESIRED_CUDA:2:2}.${DESIRED_CUDA:4:1}" fi fi @@ -50,6 +61,7 @@ else fi cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.') +<<<<<<< HEAD TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6" case ${CUDA_VERSION} in @@ -68,6 +80,25 @@ case ${CUDA_VERSION} in 11.8) TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};3.7;9.0" EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON") +======= +EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON") + +case ${CUDA_VERSION} in + #removing sm_50-sm_60 as these architectures are deprecated in CUDA 12.8/9 and will be removed in future releases + #however we would like to keep sm_70 architecture see: https://github.com/pytorch/pytorch/issues/157517 + 12.8) + TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0" + ;; + 12.9) + TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0+PTX" + # WAR to resolve the ld error in libtorch build with CUDA 12.9 + if [[ "$PACKAGE_TYPE" == "libtorch" ]]; then + TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0;10.0;12.0+PTX" + fi + ;; + 12.6) + TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ;; *) echo "unknown cuda version $CUDA_VERSION" @@ -91,14 +122,24 @@ fi mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release) +<<<<<<< HEAD if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then LIBGOMP_PATH="/usr/lib64/libgomp.so.1" elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then +======= +if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) LIBGOMP_PATH="/usr/lib64/libgomp.so.1" elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then LIBGOMP_PATH="/usr/lib64/libgomp.so.1" elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1" +<<<<<<< HEAD +======= +else + echo "Unknown OS: '$OS_NAME'" + exit 1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi DEPS_LIST=( @@ -108,6 +149,7 @@ DEPS_SONAME=( "libgomp.so.1" ) +<<<<<<< HEAD # CUDA 11.8 have to ship the libcusparseLt.so.0 with the binary # since nvidia-cusparselt-cu11 is not available in PYPI if [[ $USE_CUSPARSELT == "1" && $CUDA_VERSION == "11.8" ]]; then @@ -128,11 +170,18 @@ fi # CUDA_VERSION 12.4, 12.6, 12.8 +======= + +# CUDA_VERSION 12.6, 12.8, 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ $CUDA_VERSION == 12* ]]; then export USE_STATIC_CUDNN=0 # Try parallelizing nvcc as well export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2" +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then echo "Bundling with cudnn and cublas." 
DEPS_LIST+=( @@ -148,9 +197,16 @@ if [[ $CUDA_VERSION == 12* ]]; then "/usr/local/cuda/lib64/libcublasLt.so.12" "/usr/local/cuda/lib64/libcusparseLt.so.0" "/usr/local/cuda/lib64/libcudart.so.12" +<<<<<<< HEAD "/usr/local/cuda/lib64/libnvToolsExt.so.1" "/usr/local/cuda/lib64/libnvrtc.so.12" "/usr/local/cuda/lib64/libnvrtc-builtins.so" +======= + "/usr/local/cuda/lib64/libnvrtc.so.12" + "/usr/local/cuda/lib64/libnvrtc-builtins.so" + "/usr/local/cuda/lib64/libcufile.so.0" + "/usr/local/cuda/lib64/libcufile_rdma.so.1" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) DEPS_SONAME+=( "libcudnn_adv.so.9" @@ -165,6 +221,7 @@ if [[ $CUDA_VERSION == 12* ]]; then "libcublasLt.so.12" "libcusparseLt.so.0" "libcudart.so.12" +<<<<<<< HEAD "libnvToolsExt.so.1" "libnvrtc.so.12" "libnvrtc-builtins.so" @@ -179,6 +236,13 @@ if [[ $CUDA_VERSION == 12* ]]; then "libcufile_rdma.so.1" ) fi +======= + "libnvrtc.so.12" + "libnvrtc-builtins.so" + "libcufile.so.0" + "libcufile_rdma.so.1" + ) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else echo "Using nvidia libs from pypi." CUDA_RPATHS=( @@ -191,6 +255,7 @@ if [[ $CUDA_VERSION == 12* ]]; then '$ORIGIN/../../nvidia/curand/lib' '$ORIGIN/../../nvidia/cusolver/lib' '$ORIGIN/../../nvidia/cusparse/lib' +<<<<<<< HEAD '$ORIGIN/../../cusparselt/lib' '$ORIGIN/../../nvidia/nccl/lib' '$ORIGIN/../../nvidia/nvtx/lib' @@ -267,18 +332,31 @@ elif [[ $CUDA_VERSION == "11.8" ]]; then '$ORIGIN/../../nvidia/cusparse/lib' '$ORIGIN/../../nvidia/nccl/lib' '$ORIGIN/../../nvidia/nvtx/lib' +======= + '$ORIGIN/../../nvidia/cusparselt/lib' + '$ORIGIN/../../cusparselt/lib' + '$ORIGIN/../../nvidia/nccl/lib' + '$ORIGIN/../../nvidia/nvtx/lib' + '$ORIGIN/../../nvidia/cufile/lib' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}") export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib' export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN' export FORCE_RPATH="--force-rpath" export USE_STATIC_NCCL=0 +<<<<<<< HEAD export USE_SYSTEM_NCCL=1 export ATEN_STATIC_CUDA=0 export USE_CUDA_STATIC_LINK=0 export USE_CUPTI_SO=1 export NCCL_INCLUDE_DIR="/usr/local/cuda/include/" export NCCL_LIB_DIR="/usr/local/cuda/lib64/" +======= + export ATEN_STATIC_CUDA=0 + export USE_CUDA_STATIC_LINK=0 + export USE_CUPTI_SO=1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi else echo "Unknown cuda version $CUDA_VERSION" diff --git a/.ci/manywheel/build_libtorch.sh b/.ci/manywheel/build_libtorch.sh index 41d8c4e15272..f2da1a850d2d 100644 --- a/.ci/manywheel/build_libtorch.sh +++ b/.ci/manywheel/build_libtorch.sh @@ -22,9 +22,13 @@ retry () { # TODO move this into the Docker images OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release` +<<<<<<< HEAD if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then retry yum install -q -y zip openssl elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then +======= +if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) retry yum install -q -y zip openssl elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then retry dnf install -q -y zip openssl @@ -35,6 +39,12 @@ elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then sed -i 's/.*nvidia.*/# &/' $(find /etc/apt/ 
-type f -name "*.list") retry apt-get update retry apt-get -y install zip openssl +<<<<<<< HEAD +======= +else + echo "Unknown OS: '$OS_NAME'" + exit 1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi # Version: setup.py uses $PYTORCH_BUILD_VERSION.post$PYTORCH_BUILD_NUMBER if @@ -91,16 +101,23 @@ if [[ -z "$PYTORCH_ROOT" ]]; then exit 1 fi pushd "$PYTORCH_ROOT" +<<<<<<< HEAD +======= +retry pip install -q cmake +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python setup.py clean retry pip install -qr requirements.txt retry pip install -q numpy==2.0.1 +<<<<<<< HEAD if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then export _GLIBCXX_USE_CXX11_ABI=1 else export _GLIBCXX_USE_CXX11_ABI=0 fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then echo "Calling build_amd.py at $(date)" python tools/amd_build/build_amd.py @@ -169,12 +186,15 @@ fi ) +<<<<<<< HEAD if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then LIBTORCH_ABI="cxx11-abi-" else LIBTORCH_ABI= fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ( set -x diff --git a/.ci/manywheel/build_rocm.sh b/.ci/manywheel/build_rocm.sh index 703248d44aa9..331b3f369516 100755 --- a/.ci/manywheel/build_rocm.sh +++ b/.ci/manywheel/build_rocm.sh @@ -95,6 +95,10 @@ ROCM_SO_FILES=( "libroctracer64.so" "libroctx64.so" "libhipblaslt.so" +<<<<<<< HEAD +======= + "libhipsparselt.so" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "libhiprtc.so" ) @@ -186,6 +190,7 @@ do OS_SO_FILES[${#OS_SO_FILES[@]}]=$file_name # Append lib to array done +<<<<<<< HEAD # rocBLAS library files ROCBLAS_LIB_SRC=$ROCM_HOME/lib/rocblas/library ROCBLAS_LIB_DST=lib/rocblas/library @@ -193,13 +198,36 @@ ARCH=$(echo $PYTORCH_ROCM_ARCH | sed 's/;/|/g') # Replace ; seperated arch list ARCH_SPECIFIC_FILES=$(ls $ROCBLAS_LIB_SRC | grep -E $ARCH) OTHER_FILES=$(ls $ROCBLAS_LIB_SRC | grep -v gfx) ROCBLAS_LIB_FILES=($ARCH_SPECIFIC_FILES $OTHER_FILES) +======= +ARCH=$(echo $PYTORCH_ROCM_ARCH | sed 's/;/|/g') # Replace ; separated arch list to bar for grep + +# rocBLAS library files +ROCBLAS_LIB_SRC=$ROCM_HOME/lib/rocblas/library +ROCBLAS_LIB_DST=lib/rocblas/library +ROCBLAS_ARCH_SPECIFIC_FILES=$(ls $ROCBLAS_LIB_SRC | grep -E $ARCH) +ROCBLAS_OTHER_FILES=$(ls $ROCBLAS_LIB_SRC | grep -v gfx) +ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $OTHER_FILES) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # hipblaslt library files HIPBLASLT_LIB_SRC=$ROCM_HOME/lib/hipblaslt/library HIPBLASLT_LIB_DST=lib/hipblaslt/library +<<<<<<< HEAD ARCH_SPECIFIC_FILES=$(ls $HIPBLASLT_LIB_SRC | grep -E $ARCH) OTHER_FILES=$(ls $HIPBLASLT_LIB_SRC | grep -v gfx) HIPBLASLT_LIB_FILES=($ARCH_SPECIFIC_FILES $OTHER_FILES) +======= +HIPBLASLT_ARCH_SPECIFIC_FILES=$(ls $HIPBLASLT_LIB_SRC | grep -E $ARCH) +HIPBLASLT_OTHER_FILES=$(ls $HIPBLASLT_LIB_SRC | grep -v gfx) +HIPBLASLT_LIB_FILES=($HIPBLASLT_ARCH_SPECIFIC_FILES $HIPBLASLT_OTHER_FILES) + +# hipsparselt library files +HIPSPARSELT_LIB_SRC=$ROCM_HOME/lib/hipsparselt/library +HIPSPARSELT_LIB_DST=lib/hipsparselt/library 
+HIPSPARSELT_ARCH_SPECIFIC_FILES=$(ls $HIPSPARSELT_LIB_SRC | grep -E $ARCH) +#HIPSPARSELT_OTHER_FILES=$(ls $HIPSPARSELT_LIB_SRC | grep -v gfx) +HIPSPARSELT_LIB_FILES=($HIPSPARSELT_ARCH_SPECIFIC_FILES $HIPSPARSELT_OTHER_FILES) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # ROCm library files ROCM_SO_PATHS=() @@ -234,12 +262,20 @@ DEPS_SONAME=( DEPS_AUX_SRCLIST=( "${ROCBLAS_LIB_FILES[@]/#/$ROCBLAS_LIB_SRC/}" "${HIPBLASLT_LIB_FILES[@]/#/$HIPBLASLT_LIB_SRC/}" +<<<<<<< HEAD +======= + "${HIPSPARSELT_LIB_FILES[@]/#/$HIPSPARSELT_LIB_SRC/}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "/opt/amdgpu/share/libdrm/amdgpu.ids" ) DEPS_AUX_DSTLIST=( "${ROCBLAS_LIB_FILES[@]/#/$ROCBLAS_LIB_DST/}" "${HIPBLASLT_LIB_FILES[@]/#/$HIPBLASLT_LIB_DST/}" +<<<<<<< HEAD +======= + "${HIPSPARSELT_LIB_FILES[@]/#/$HIPSPARSELT_LIB_DST/}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "share/libdrm/amdgpu.ids" ) diff --git a/.ci/manywheel/build_xpu.sh b/.ci/manywheel/build_xpu.sh index 2bc60dd10727..b6016d45a96a 100755 --- a/.ci/manywheel/build_xpu.sh +++ b/.ci/manywheel/build_xpu.sh @@ -20,7 +20,15 @@ fi source /opt/intel/oneapi/compiler/latest/env/vars.sh source /opt/intel/oneapi/pti/latest/env/vars.sh source /opt/intel/oneapi/umf/latest/env/vars.sh +<<<<<<< HEAD export USE_STATIC_MKL=1 +======= +source /opt/intel/oneapi/ccl/latest/env/vars.sh +source /opt/intel/oneapi/mpi/latest/env/vars.sh +export USE_STATIC_MKL=1 +export USE_ONEMKL=1 +export USE_XCCL=1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) WHEELHOUSE_DIR="wheelhousexpu" LIBTORCH_HOUSE_DIR="libtorch_housexpu" diff --git a/.ci/onnx/README.md b/.ci/onnx/README.md index 837e9b7d8109..bee250c271ef 100644 --- a/.ci/onnx/README.md +++ b/.ci/onnx/README.md @@ -10,5 +10,8 @@ example: `py2-cuda9.0-cudnn7-ubuntu16.04`. The Docker images that are built on Jenkins and are used in triggered builds already have this environment variable set in their manifest. Also see `./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`. +<<<<<<< HEAD Our Jenkins installation is located at https://ci.pytorch.org/jenkins/. +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/pytorch/build.sh b/.ci/pytorch/build.sh index dfc4e0fab927..58137d79acdf 100755 --- a/.ci/pytorch/build.sh +++ b/.ci/pytorch/build.sh @@ -27,6 +27,15 @@ cmake --version echo "Environment variables:" env +<<<<<<< HEAD +======= +# The sccache wrapped version of nvcc gets put in /opt/cache/lib in docker since +# there are some issues if it is always wrapped, so we need to add it to PATH +# during CI builds. 
+# https://github.com/pytorch/pytorch/blob/0b6c0898e6c352c8ea93daec854e704b41485375/.ci/docker/common/install_cache.sh#L97 +export PATH="/opt/cache/lib:$PATH" + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then # Use jemalloc during compilation to mitigate https://github.com/pytorch/pytorch/issues/116289 export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2 @@ -35,7 +44,11 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then fi if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then +<<<<<<< HEAD if [[ "$BUILD_ENVIRONMENT" != *cuda11.3* && "$BUILD_ENVIRONMENT" != *clang* ]]; then +======= + if [[ "$BUILD_ENVIRONMENT" != *clang* ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # TODO: there is a linking issue when building with UCC using clang, # disable it for now and to be fix later. # TODO: disable UCC temporarily to enable CUDA 12.1 in CI @@ -52,12 +65,15 @@ fi export USE_LLVM=/opt/llvm export LLVM_DIR=/opt/llvm/lib/cmake/llvm +<<<<<<< HEAD if [[ "$BUILD_ENVIRONMENT" == *executorch* ]]; then # To build test_edge_op_registration export BUILD_EXECUTORCH=ON export USE_CUDA=0 fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if ! which conda; then # In ROCm CIs, we are doing cross compilation on build machines with # intel cpu and later run tests on machines with amd cpu. @@ -171,6 +187,15 @@ fi if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then # shellcheck disable=SC1091 source /opt/intel/oneapi/compiler/latest/env/vars.sh +<<<<<<< HEAD +======= + # shellcheck disable=SC1091 + source /opt/intel/oneapi/ccl/latest/env/vars.sh + # shellcheck disable=SC1091 + source /opt/intel/oneapi/mpi/latest/env/vars.sh + # Enable XCCL build + export USE_XCCL=1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA export USE_KINETO=0 export TORCH_XPU_ARCH_LIST=pvc @@ -251,6 +276,10 @@ if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then set -e -o pipefail get_bazel +<<<<<<< HEAD +======= + python3 tools/optional_submodules.py checkout_eigen +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Leave 1 CPU free and use only up to 80% of memory to reduce the change of crashing # the runner @@ -277,10 +306,15 @@ else # or building non-XLA tests. if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *xla* ]]; then +<<<<<<< HEAD if [[ "$BUILD_ENVIRONMENT" != *py3.8* ]]; then # Install numpy-2.0.2 for builds which are backward compatible with 1.X python -mpip install numpy==2.0.2 fi +======= + # Install numpy-2.0.2 for builds which are backward compatible with 1.X + python -mpip install numpy==2.0.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) WERROR=1 python setup.py clean @@ -303,6 +337,21 @@ else fi pip_install_whl "$(echo dist/*.whl)" +<<<<<<< HEAD +======= + if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then + echo "Checking that xpu is compiled" + pushd dist/ + if python -c 'import torch; exit(0 if torch.xpu._is_compiled() else 1)'; then + echo "XPU support is compiled in." 
+ else + echo "XPU support is NOT compiled in." + exit 1 + fi + popd + fi + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # TODO: I'm not sure why, but somehow we lose verbose commands set -x diff --git a/.ci/pytorch/check_binary.sh b/.ci/pytorch/check_binary.sh index f77d8a5b0777..74c7f301eff7 100755 --- a/.ci/pytorch/check_binary.sh +++ b/.ci/pytorch/check_binary.sh @@ -63,6 +63,7 @@ fi # Check GCC ABI ############################################################################### +<<<<<<< HEAD # NOTE [ Building libtorch with old vs. new gcc ABI ] # # Packages built with one version of ABI could not be linked against by client @@ -121,6 +122,14 @@ if [[ "$(uname)" != 'Darwin' ]]; then fi # We also check that there are [not] cxx11 symbols in libtorch +======= +# NOTE: As of https://github.com/pytorch/pytorch/issues/126551 we only produce +# wheels with cxx11-abi + +echo "Checking that the gcc ABI is what we expect" +if [[ "$(uname)" != 'Darwin' ]]; then + # We also check that there are cxx11 symbols in libtorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # echo "Checking that symbols in libtorch.so have the right gcc abi" python3 "$(dirname ${BASH_SOURCE[0]})/smoke_test/check_binary_symbols.py" @@ -198,6 +207,7 @@ setup_link_flags () { TEST_CODE_DIR="$(dirname $(realpath ${BASH_SOURCE[0]}))/test_example_code" build_and_run_example_cpp () { +<<<<<<< HEAD if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then GLIBCXX_USE_CXX11_ABI=1 else @@ -227,6 +237,13 @@ build_example_cpp_with_incorrect_abi () { fi } +======= + setup_link_flags + g++ ${TEST_CODE_DIR}/$1.cpp -I${install_root}/include -I${install_root}/include/torch/csrc/api/include -std=gnu++17 -L${install_root}/lib ${REF_LIB} ${ADDITIONAL_LINKER_FLAGS} -ltorch $TORCH_CPU_LINK_FLAGS $TORCH_CUDA_LINK_FLAGS $C10_LINK_FLAGS -o $1 + ./$1 +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ############################################################################### # Check simple Python/C++ calls ############################################################################### @@ -236,11 +253,14 @@ if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then export LD_LIBRARY_PATH=/usr/local/cuda/lib64 fi build_and_run_example_cpp simple-torch-test +<<<<<<< HEAD # `_GLIBCXX_USE_CXX11_ABI` is always ignored by gcc in devtoolset7, so we test # the expected failure case for Ubuntu 16.04 + gcc 5.4 only. 
if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then build_example_cpp_with_incorrect_abi simple-torch-test fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else pushd /tmp python -c 'import torch' @@ -298,6 +318,17 @@ else fi ############################################################################### +<<<<<<< HEAD +======= +# Check XPU configured correctly +############################################################################### +if [[ "$DESIRED_CUDA" == 'xpu' && "$PACKAGE_TYPE" != 'libtorch' ]]; then + echo "Checking that xpu is compiled" + python -c 'import torch; exit(0 if torch.xpu._is_compiled() else 1)' +fi + +############################################################################### +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Check CUDA configured correctly ############################################################################### # Skip these for Windows machines without GPUs @@ -375,10 +406,30 @@ except RuntimeError as e: fi ############################################################################### +<<<<<<< HEAD # Check for C++ ABI compatibility between gcc7 and gcc9 compiled binaries ############################################################################### if [[ "$(uname)" == 'Linux' && "$PACKAGE_TYPE" == 'manywheel' ]]; then pushd /tmp python -c "import torch; exit(0 if torch.compiled_with_cxx11_abi() else (0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi1011' else 1))" +======= +# Check for C++ ABI compatibility to GCC-11 - GCC 13 +############################################################################### +if [[ "$(uname)" == 'Linux' && "$PACKAGE_TYPE" == 'manywheel' ]]; then + pushd /tmp + # Per https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Dialect-Options.html + # gcc-11 is ABI16, gcc-13 is ABI18, gcc-14 is ABI19 + # gcc 11 - CUDA 11.8, xpu, rocm + # gcc 13 - CUDA 12.6, 12.8 and cpu + # Please see issue for reference: https://github.com/pytorch/pytorch/issues/152426 + if [[ "$(uname -m)" == "s390x" ]]; then + cxx_abi="19" + elif [[ "$DESIRED_CUDA" != 'xpu' && "$DESIRED_CUDA" != 'rocm'* ]]; then + cxx_abi="18" + else + cxx_abi="16" + fi + python -c "import torch; exit(0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi10${cxx_abi}' else 1)" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) popd fi diff --git a/.ci/pytorch/common.sh b/.ci/pytorch/common.sh index e71f6d6eaf0b..e8ccdb3bb635 100644 --- a/.ci/pytorch/common.sh +++ b/.ci/pytorch/common.sh @@ -13,6 +13,7 @@ if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then # HIP_PLATFORM is auto-detected by hipcc; unset to avoid build errors unset HIP_PLATFORM export PYTORCH_TEST_WITH_ROCM=1 +<<<<<<< HEAD # temporary to locate some kernel issues on the CI nodes export HSAKMT_DEBUG_LEVEL=4 # improve rccl performance for distributed tests @@ -20,5 +21,10 @@ if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then fi # TODO: Renable libtorch testing for MacOS, see https://github.com/pytorch/pytorch/issues/62598 +======= +fi + +# TODO: Reenable libtorch testing for MacOS, see https://github.com/pytorch/pytorch/issues/62598 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # shellcheck disable=SC2034 BUILD_TEST_LIBTORCH=0 diff --git a/.ci/pytorch/common_utils.sh b/.ci/pytorch/common_utils.sh 
index 4f8439bd832d..fc74707ff80b 100644 --- a/.ci/pytorch/common_utils.sh +++ b/.ci/pytorch/common_utils.sh @@ -67,13 +67,21 @@ function pip_install_whl() { # Loop through each path and install individually for path in "${paths[@]}"; do echo "Installing $path" +<<<<<<< HEAD python3 -mpip install "$path" +======= + python3 -mpip install --no-index --no-deps "$path" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) done else # Loop through each argument and install individually for path in "${args[@]}"; do echo "Installing $path" +<<<<<<< HEAD python3 -mpip install "$path" +======= + python3 -mpip install --no-index --no-deps "$path" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) done fi } @@ -159,7 +167,10 @@ function install_torchvision() { fi } +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) function install_torchrec_and_fbgemm() { local torchrec_commit torchrec_commit=$(get_pinned_commit torchrec) @@ -198,7 +209,11 @@ function install_torchrec_and_fbgemm() { function clone_pytorch_xla() { if [[ ! -d ./xla ]]; then +<<<<<<< HEAD git clone --recursive -b r2.7 https://github.com/pytorch/xla.git +======= + git clone --recursive -b r2.8 https://github.com/pytorch/xla.git +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pushd xla # pin the xla hash so that we don't get broken by changes to xla git checkout "$(cat ../.github/ci_commit_pins/xla.txt)" diff --git a/.ci/pytorch/macos-build.sh b/.ci/pytorch/macos-build.sh index 4a2f63a2ed10..524d04e84dbe 100755 --- a/.ci/pytorch/macos-build.sh +++ b/.ci/pytorch/macos-build.sh @@ -33,6 +33,7 @@ if which sccache > /dev/null; then export PATH="${tmp_dir}:$PATH" fi +<<<<<<< HEAD cross_compile_arm64() { # Cross compilation for arm64 # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests @@ -83,6 +84,17 @@ else compile_x86_64 fi +======= +print_cmake_info +if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then + # Needed for inductor benchmarks, as lots of HF networks make `torch.distribtued` calls + USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel +else + # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests + # that building with USE_DISTRIBUTED=0 works at all. 
See https://github.com/pytorch/pytorch/issues/86448 + USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64 +fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if which sccache > /dev/null; then print_sccache_stats fi diff --git a/.ci/pytorch/macos-common.sh b/.ci/pytorch/macos-common.sh index 1c7bc103673d..05feb61bc8d9 100755 --- a/.ci/pytorch/macos-common.sh +++ b/.ci/pytorch/macos-common.sh @@ -20,6 +20,7 @@ print_cmake_info() { CONDA_INSTALLATION_DIR=$(dirname "$CMAKE_EXEC") # Print all libraries under cmake rpath for debugging ls -la "$CONDA_INSTALLATION_DIR/../lib" +<<<<<<< HEAD export CMAKE_EXEC # Explicitly add conda env lib folder to cmake rpath to address the flaky issue @@ -30,4 +31,6 @@ print_cmake_info() { # to trust the executable. EXC_BAD_ACCESS (SIGKILL (Code Signature Invalid)) # with an exit code 137 otherwise codesign -f -s - "${CMAKE_EXEC}" || true +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } diff --git a/.ci/pytorch/macos-test.sh b/.ci/pytorch/macos-test.sh index 179556cc59d0..709074a1d47c 100755 --- a/.ci/pytorch/macos-test.sh +++ b/.ci/pytorch/macos-test.sh @@ -5,11 +5,14 @@ set -x # shellcheck source=./macos-common.sh source "$(dirname "${BASH_SOURCE[0]}")/macos-common.sh" +<<<<<<< HEAD if [[ -n "$CONDA_ENV" ]]; then # Use binaries under conda environment export PATH="$CONDA_ENV/bin":$PATH fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Test that OpenMP is enabled pushd test if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available()))") == "1" ]]; then @@ -42,6 +45,19 @@ test_python_all() { assert_git_not_dirty } +<<<<<<< HEAD +======= +test_python_mps() { + setup_test_python + + time python test/run_test.py --verbose --mps + MTL_CAPTURE_ENABLED=1 ${CONDA_RUN} python3 test/test_mps.py --verbose -k test_metal_capture + + assert_git_not_dirty +} + + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test_python_shard() { if [[ -z "$NUM_TEST_SHARDS" ]]; then echo "NUM_TEST_SHARDS must be defined to run a Python test shard" @@ -155,6 +171,10 @@ test_jit_hooks() { torchbench_setup_macos() { git clone --recursive https://github.com/pytorch/vision torchvision git clone --recursive https://github.com/pytorch/audio torchaudio +<<<<<<< HEAD +======= + brew install jpeg-turbo libpng +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pushd torchvision git fetch @@ -169,7 +189,12 @@ torchbench_setup_macos() { git checkout "$(cat ../.github/ci_commit_pins/audio.txt)" git submodule update --init --recursive python setup.py clean +<<<<<<< HEAD python setup.py develop +======= + #TODO: Remove me, when figure out how to make TorchAudio find brew installed openmp + USE_OPENMP=0 python setup.py develop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) popd # Shellcheck doesn't like it when you pass no arguments to a function that can take args. 
See https://www.shellcheck.net/wiki/SC2120 @@ -177,9 +202,14 @@ torchbench_setup_macos() { checkout_install_torchbench } +<<<<<<< HEAD conda_benchmark_deps() { conda install -y astunparse numpy scipy ninja pyyaml setuptools cmake typing-extensions requests protobuf numba cython scikit-learn conda install -y -c conda-forge librosa +======= +pip_benchmark_deps() { + python -mpip install --no-input astunparse requests cython scikit-learn +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } @@ -187,7 +217,11 @@ test_torchbench_perf() { print_cmake_info echo "Launching torchbench setup" +<<<<<<< HEAD conda_benchmark_deps +======= + pip_benchmark_deps +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) torchbench_setup_macos TEST_REPORTS_DIR=$(pwd)/test/test-reports @@ -214,13 +248,18 @@ test_torchbench_smoketest() { print_cmake_info echo "Launching torchbench setup" +<<<<<<< HEAD conda_benchmark_deps +======= + pip_benchmark_deps +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # shellcheck disable=SC2119,SC2120 torchbench_setup_macos TEST_REPORTS_DIR=$(pwd)/test/test-reports mkdir -p "$TEST_REPORTS_DIR" +<<<<<<< HEAD local backend=eager local dtype=notset local device=mps @@ -240,6 +279,56 @@ test_torchbench_smoketest() { PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \ --performance --only "$model" --backend "$backend" --inference --devices "$device" \ --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv" +======= + local device=mps + local dtypes=(undefined float16 bfloat16 notset) + local dtype=${dtypes[$1]} + local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16) + + for backend in eager inductor; do + + echo "Launching torchbench inference performance run for backend ${backend} and dtype ${dtype}" + local dtype_arg="--${dtype}" + if [ "$dtype" == notset ]; then + dtype_arg="--float32" + fi + touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv" + for model in "${models[@]}"; do + PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \ + --performance --only "$model" --backend "$backend" --inference --devices "$device" "$dtype_arg" \ + --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv" || true + if [ "$backend" == "inductor" ]; then + PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \ + --accuracy --only "$model" --backend "$backend" --inference --devices "$device" "$dtype_arg" \ + --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_accuracy.csv" || true + fi + done + if [ "$backend" == "inductor" ]; then + PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \ + --performance --backend "$backend" --inference --devices "$device" "$dtype_arg" \ + --output "$TEST_REPORTS_DIR/inductor_${backend}_huggingface_${dtype}_inference_${device}_performance.csv" || true + PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \ + --accuracy --backend "$backend" --inference --devices "$device" "$dtype_arg" \ + --output 
"$TEST_REPORTS_DIR/inductor_${backend}_huggingface_${dtype}_inference_${device}_accuracy.csv" || true + fi + + if [ "$dtype" == notset ]; then + for dtype_ in notset amp; do + echo "Launching torchbench training performance run for backend ${backend} and dtype ${dtype_}" + touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype_}_training_${device}_performance.csv" + local dtype_arg="--${dtype_}" + if [ "$dtype_" == notset ]; then + dtype_arg="--float32" + fi + for model in "${models[@]}"; do + PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \ + --performance --only "$model" --backend "$backend" --training --devices "$device" "$dtype_arg" \ + --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype_}_training_${device}_performance.csv" || true + done + done + fi + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) done echo "Pytorch benchmark on mps device completed" @@ -249,7 +338,11 @@ test_hf_perf() { print_cmake_info TEST_REPORTS_DIR=$(pwd)/test/test-reports mkdir -p "$TEST_REPORTS_DIR" +<<<<<<< HEAD conda_benchmark_deps +======= + pip_benchmark_deps +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) torchbench_setup_macos echo "Launching HuggingFace training perf run" @@ -265,7 +358,11 @@ test_timm_perf() { print_cmake_info TEST_REPORTS_DIR=$(pwd)/test/test-reports mkdir -p "$TEST_REPORTS_DIR" +<<<<<<< HEAD conda_benchmark_deps +======= + pip_benchmark_deps +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) torchbench_setup_macos echo "Launching timm training perf run" @@ -288,7 +385,13 @@ elif [[ $TEST_CONFIG == *"perf_hf"* ]]; then elif [[ $TEST_CONFIG == *"perf_timm"* ]]; then test_timm_perf elif [[ $TEST_CONFIG == *"perf_smoketest"* ]]; then +<<<<<<< HEAD test_torchbench_smoketest +======= + test_torchbench_smoketest "${SHARD_NUMBER}" +elif [[ $TEST_CONFIG == *"mps"* ]]; then + test_python_mps +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [[ $NUM_TEST_SHARDS -gt 1 ]]; then test_python_shard "${SHARD_NUMBER}" if [[ "${SHARD_NUMBER}" == 1 ]]; then diff --git a/.ci/pytorch/python_doc_push_script.sh b/.ci/pytorch/python_doc_push_script.sh index 229a4a5b5297..6c8f983b98d3 100755 --- a/.ci/pytorch/python_doc_push_script.sh +++ b/.ci/pytorch/python_doc_push_script.sh @@ -119,12 +119,15 @@ popd git rm -rf "$install_path" || true mv "$pt_checkout/docs/build/html" "$install_path" +<<<<<<< HEAD # Prevent Google from indexing $install_path/_modules. This folder contains # generated source files. # NB: the following only works on gnu sed. The sed shipped with mac os is different. # One can `brew install gnu-sed` on a mac and then use "gsed" instead of "sed". 
find "$install_path/_modules" -name "*.html" -print0 | xargs -0 sed -i '//a \ \ ' +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) git add "$install_path" || true git status git config user.email "soumith+bot@pytorch.org" diff --git a/.ci/pytorch/run_tests.sh b/.ci/pytorch/run_tests.sh index 6c1c55468864..87714d654de4 100755 --- a/.ci/pytorch/run_tests.sh +++ b/.ci/pytorch/run_tests.sh @@ -76,7 +76,11 @@ fi # Environment initialization if [[ "$(uname)" == Darwin ]]; then # Install the testing dependencies +<<<<<<< HEAD retry conda install -yq future hypothesis ${NUMPY_PACKAGE} ${PROTOBUF_PACKAGE} pytest setuptools six typing_extensions pyyaml +======= + retry pip install -q future hypothesis ${NUMPY_PACKAGE} ${PROTOBUF_PACKAGE} pytest setuptools six typing_extensions pyyaml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else retry pip install -qr requirements.txt || true retry pip install -q hypothesis protobuf pytest setuptools || true @@ -91,7 +95,10 @@ fi echo "Testing with:" pip freeze +<<<<<<< HEAD conda list || true +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ############################################################################## # Smoke tests diff --git a/.ci/pytorch/smoke_test/check_binary_symbols.py b/.ci/pytorch/smoke_test/check_binary_symbols.py index 97d6482d63bc..2663ccaab062 100755 --- a/.ci/pytorch/smoke_test/check_binary_symbols.py +++ b/.ci/pytorch/smoke_test/check_binary_symbols.py @@ -80,7 +80,11 @@ def _get_symbols_chunk(i): return functools.reduce(list.__add__, (x.result() for x in tasks), []) +<<<<<<< HEAD def check_lib_symbols_for_abi_correctness(lib: str, pre_cxx11_abi: bool = True) -> None: +======= +def check_lib_symbols_for_abi_correctness(lib: str) -> None: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) print(f"lib: {lib}") cxx11_symbols = grep_symbols(lib, LIBTORCH_CXX11_PATTERNS) pre_cxx11_symbols = grep_symbols(lib, LIBTORCH_PRE_CXX11_PATTERNS) @@ -88,6 +92,7 @@ def check_lib_symbols_for_abi_correctness(lib: str, pre_cxx11_abi: bool = True) num_pre_cxx11_symbols = len(pre_cxx11_symbols) print(f"num_cxx11_symbols: {num_cxx11_symbols}") print(f"num_pre_cxx11_symbols: {num_pre_cxx11_symbols}") +<<<<<<< HEAD if pre_cxx11_abi: if num_cxx11_symbols > 0: raise RuntimeError( @@ -110,6 +115,14 @@ def check_lib_symbols_for_abi_correctness(lib: str, pre_cxx11_abi: bool = True) ) if num_cxx11_symbols < 100: raise RuntimeError("Didn't find enought cxx11 symbols") +======= + if num_pre_cxx11_symbols > 0: + raise RuntimeError( + f"Found pre-cxx11 symbols, but there shouldn't be any, see: {pre_cxx11_symbols[:100]}" + ) + if num_cxx11_symbols < 100: + raise RuntimeError("Didn't find enough cxx11 symbols") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) def main() -> None: @@ -121,9 +134,14 @@ def main() -> None: else: install_root = Path(distutils.sysconfig.get_python_lib()) / "torch" +<<<<<<< HEAD libtorch_cpu_path = install_root / "lib" / "libtorch_cpu.so" pre_cxx11_abi = "cxx11-abi" not in os.getenv("DESIRED_DEVTOOLSET", "") check_lib_symbols_for_abi_correctness(libtorch_cpu_path, pre_cxx11_abi) +======= + libtorch_cpu_path = str(install_root / "lib" / 
"libtorch_cpu.so") + check_lib_symbols_for_abi_correctness(libtorch_cpu_path) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if __name__ == "__main__": diff --git a/.ci/pytorch/smoke_test/check_gomp.py b/.ci/pytorch/smoke_test/check_gomp.py index 93430ff39906..a47b76201bbe 100644 --- a/.ci/pytorch/smoke_test/check_gomp.py +++ b/.ci/pytorch/smoke_test/check_gomp.py @@ -46,6 +46,12 @@ def get_gomp_thread(): # use the default gomp path of AlmaLinux OS libgomp_path = "/usr/lib64/libgomp.so.1" +<<<<<<< HEAD +======= + # if it does not exist, try Ubuntu path + if not os.path.exists(libgomp_path): + libgomp_path = f"/usr/lib/{os.uname().machine}-linux-gnu/libgomp.so.1" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) os.environ["GOMP_CPU_AFFINITY"] = "0-3" diff --git a/.ci/pytorch/smoke_test/smoke_test.py b/.ci/pytorch/smoke_test/smoke_test.py index 6f5531178319..43c2a3fb753f 100644 --- a/.ci/pytorch/smoke_test/smoke_test.py +++ b/.ci/pytorch/smoke_test/smoke_test.py @@ -7,6 +7,10 @@ import sys from pathlib import Path from tempfile import NamedTemporaryFile +<<<<<<< HEAD +======= +from typing import Optional +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) import torch import torch._dynamo @@ -195,8 +199,46 @@ def test_cuda_gds_errors_captured() -> None: ) +<<<<<<< HEAD def smoke_test_cuda( package: str, runtime_error_check: str, torch_compile_check: str +======= +def find_pypi_package_version(package: str) -> Optional[str]: + from importlib import metadata + + dists = metadata.distributions() + for dist in dists: + if dist.metadata["Name"].startswith(package): + return dist.version + return None + + +def cudnn_to_version_str(cudnn_version: int) -> str: + patch = int(cudnn_version % 10) + minor = int((cudnn_version / 100) % 100) + major = int((cudnn_version / 10000) % 10000) + return f"{major}.{minor}.{patch}" + + +def compare_pypi_to_torch_versions( + package: str, pypi_version: str, torch_version: str +) -> None: + if pypi_version is None: + raise RuntimeError(f"Can't find {package} in PyPI for Torch: {torch_version}") + if pypi_version.startswith(torch_version): + print(f"Found matching {package}. Torch: {torch_version} PyPI {pypi_version}") + else: + raise RuntimeError( + f"Wrong {package} version. Torch: {torch_version} PyPI: {pypi_version}" + ) + + +def smoke_test_cuda( + package: str, + runtime_error_check: str, + torch_compile_check: str, + pypi_pkg_check: str, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) -> None: if not torch.cuda.is_available() and is_cuda_system: raise RuntimeError(f"Expected CUDA {gpu_arch_ver}. However CUDA is not loaded.") @@ -226,20 +268,44 @@ def smoke_test_cuda( raise RuntimeError( f"Wrong CUDA version. Loaded: {torch.version.cuda} Expected: {gpu_arch_ver}" ) +<<<<<<< HEAD print(f"torch cuda: {torch.version.cuda}") # todo add cudnn version validation print(f"torch cudnn: {torch.backends.cudnn.version()}") print(f"cuDNN enabled? 
{torch.backends.cudnn.enabled}") +======= + + print(f"torch cuda: {torch.version.cuda}") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) torch.cuda.init() print("CUDA initialized successfully") print(f"Number of CUDA devices: {torch.cuda.device_count()}") for i in range(torch.cuda.device_count()): print(f"Device {i}: {torch.cuda.get_device_name(i)}") +<<<<<<< HEAD # nccl is availbale only on Linux if sys.platform in ["linux", "linux2"]: print(f"torch nccl version: {torch.cuda.nccl.version()}") +======= + print(f"cuDNN enabled? {torch.backends.cudnn.enabled}") + torch_cudnn_version = cudnn_to_version_str(torch.backends.cudnn.version()) + print(f"Torch cuDNN version: {torch_cudnn_version}") + + if sys.platform in ["linux", "linux2"]: + torch_nccl_version = ".".join(str(v) for v in torch.cuda.nccl.version()) + print(f"Torch nccl; version: {torch_nccl_version}") + + # Pypi dependencies are installed on linux only and nccl is available only on Linux. + if pypi_pkg_check == "enabled" and sys.platform in ["linux", "linux2"]: + compare_pypi_to_torch_versions( + "cudnn", find_pypi_package_version("nvidia-cudnn"), torch_cudnn_version + ) + compare_pypi_to_torch_versions( + "nccl", find_pypi_package_version("nvidia-nccl"), torch_nccl_version + ) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if runtime_error_check == "enabled": test_cuda_runtime_errors_captured() @@ -398,6 +464,16 @@ def parse_args(): choices=["enabled", "disabled"], default="enabled", ) +<<<<<<< HEAD +======= + parser.add_argument( + "--pypi-pkg-check", + help="Check pypi package versions cudnn and nccl", + type=str, + choices=["enabled", "disabled"], + default="enabled", + ) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return parser.parse_args() @@ -422,7 +498,14 @@ def main() -> None: smoke_test_modules() smoke_test_cuda( +<<<<<<< HEAD options.package, options.runtime_error_check, options.torch_compile_check +======= + options.package, + options.runtime_error_check, + options.torch_compile_check, + options.pypi_pkg_check, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index 369148833ea9..c4ca78cc6ae4 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh @@ -191,8 +191,17 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then # shellcheck disable=SC1091 source /opt/intel/oneapi/umf/latest/env/vars.sh fi +<<<<<<< HEAD # Check XPU status before testing xpu-smi discovery +======= + # shellcheck disable=SC1091 + source /opt/intel/oneapi/ccl/latest/env/vars.sh + # shellcheck disable=SC1091 + source /opt/intel/oneapi/mpi/latest/env/vars.sh + # Check XPU status before testing + timeout 30 xpu-smi discovery || true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi if [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]] ; then @@ -220,7 +229,11 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then export PYTORCH_TEST_WITH_ASAN=1 export PYTORCH_TEST_WITH_UBSAN=1 # TODO: Figure out how to avoid hard-coding these paths +<<<<<<< HEAD export ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-15/bin/llvm-symbolizer +======= + export ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-18/bin/llvm-symbolizer +>>>>>>> 5729657180 ([ROCm] Specialized 
binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) export TORCH_USE_RTLD_GLOBAL=1 # NB: We load libtorch.so with RTLD_GLOBAL for UBSAN, unlike our # default behavior. @@ -312,6 +325,26 @@ test_python() { assert_git_not_dirty } +<<<<<<< HEAD +======= +test_python_smoke() { + # Smoke tests for H100 + time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running + assert_git_not_dirty +} + +test_h100_distributed() { + # Distributed tests at H100 + time python test/run_test.py --include distributed/_composable/test_composability/test_pp_composability.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running + # This test requires multicast support + time python test/run_test.py --include distributed/_composable/fsdp/test_fully_shard_comm.py -k TestFullyShardAllocFromPG $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running + # symmetric memory test + time python test/run_test.py --include distributed/test_symmetric_memory.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running + time python test/run_test.py --include distributed/test_nvshmem.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running + assert_git_not_dirty +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test_lazy_tensor_meta_reference_disabled() { export TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE=1 echo "Testing lazy tensor operations without meta reference" @@ -340,6 +373,20 @@ test_dynamo_wrapped_shard() { assert_git_not_dirty } +<<<<<<< HEAD +======= +test_einops() { + pip install einops==0.6.1 + time python test/run_test.py --einops --verbose --upload-artifacts-while-running + pip install einops==0.7.0 + time python test/run_test.py --einops --verbose --upload-artifacts-while-running + pip install einops==0.8.1 + time python test/run_test.py --einops --verbose --upload-artifacts-while-running + assert_git_not_dirty +} + + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test_inductor_distributed() { # Smuggle a few multi-gpu tests here so that we don't have to request another large node echo "Testing multi_gpu tests in test_torchinductor" @@ -396,8 +443,20 @@ test_inductor_aoti() { # We need to hipify before building again python3 tools/amd_build/build_amd.py fi +<<<<<<< HEAD BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference +======= + if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then + BUILD_AOT_INDUCTOR_TEST=1 TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python setup.py develop + # TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB + LD_LIBRARY_PATH=/opt/conda/envs/py_3.10/lib/:${TORCH_LIB_DIR}:$LD_LIBRARY_PATH + CPP_TESTS_DIR="${BUILD_BIN_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile + else + BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop + CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) } test_inductor_cpp_wrapper_shard() { @@ -412,10 +471,18 @@ test_inductor_cpp_wrapper_shard() { if [[ "$1" -eq "2" ]]; then # For now, manually put the opinfo tests in shard 2, and all other tests in +<<<<<<< HEAD # shard 1. Test specific things triggering past bugs, for now. python test/run_test.py \ --include inductor/test_torchinductor_opinfo \ -k 'linalg or to_sparse' \ +======= + # shard 1. Run all CPU tests, as well as specific GPU tests triggering past + # bugs, for now. + python test/run_test.py \ + --include inductor/test_torchinductor_opinfo \ + -k 'linalg or to_sparse or TestInductorOpInfoCPU' \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --verbose exit fi @@ -570,7 +637,13 @@ test_perf_for_dashboard() { local device=cuda if [[ "${TEST_CONFIG}" == *cpu* ]]; then +<<<<<<< HEAD if [[ "${TEST_CONFIG}" == *cpu_x86* ]]; then +======= + if [[ "${TEST_CONFIG}" == *zen_cpu_x86* ]]; then + device=zen_cpu_x86 + elif [[ "${TEST_CONFIG}" == *cpu_x86* ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) device=cpu_x86 elif [[ "${TEST_CONFIG}" == *cpu_aarch64* ]]; then device=cpu_aarch64 @@ -800,6 +873,7 @@ test_inductor_torchbench_smoketest_perf() { done } +<<<<<<< HEAD test_inductor_get_core_number() { if [[ "${TEST_CONFIG}" == *aarch64* ]]; then echo "$(($(lscpu | grep 'Cluster(s):' | awk '{print $2}') * $(lscpu | grep 'Core(s) per cluster:' | awk '{print $4}')))" @@ -810,6 +884,9 @@ test_inductor_get_core_number() { test_inductor_set_cpu_affinity(){ #set jemalloc +======= +test_inductor_set_cpu_affinity(){ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) JEMALLOC_LIB="$(find /usr/lib -name libjemalloc.so.2)" export LD_PRELOAD="$JEMALLOC_LIB":"$LD_PRELOAD" export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" @@ -821,14 +898,33 @@ test_inductor_set_cpu_affinity(){ export KMP_AFFINITY=granularity=fine,compact,1,0 export KMP_BLOCKTIME=1 fi +<<<<<<< HEAD cores=$(test_inductor_get_core_number) # Set number of cores to 16 on Aarch64 for performance runs. 
+======= + + # Use nproc here instead of lscpu because it takes into account cgroups slice + cpus=$(nproc) + thread_per_core=$(lscpu | grep 'Thread(s) per core:' | awk '{print $4}') + cores=$((cpus / thread_per_core)) + + # Set number of cores to 16 on aarch64 for performance runs +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ "${TEST_CONFIG}" == *aarch64* && $cores -gt 16 ]]; then cores=16 fi export OMP_NUM_THREADS=$cores +<<<<<<< HEAD end_core=$((cores-1)) export TASKSET="taskset -c 0-$end_core" +======= + + # Handle cgroups slice start and end CPU + start_cpu=$(python -c 'import os; print(min(os.sched_getaffinity(0)))') + # Leaving one physical CPU for other tasks + end_cpu=$(($(python -c 'import os; print(max(os.sched_getaffinity(0)))') - thread_per_core)) + export TASKSET="taskset -c $start_cpu-$end_cpu" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } test_inductor_torchbench_cpu_smoketest_perf(){ @@ -1111,6 +1207,15 @@ test_custom_backend() { test_custom_script_ops() { echo "Testing custom script operators" +<<<<<<< HEAD +======= + + if [[ "$BUILD_ENVIRONMENT" == *s390x* ]]; then + echo "Skipping custom script operators until it's fixed" + return 0 + fi + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CUSTOM_OP_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/custom-op-build" pushd test/custom_operator cp -a "$CUSTOM_OP_BUILD" build @@ -1173,7 +1278,10 @@ build_xla() { # These functions are defined in .circleci/common.sh in pytorch/xla repo retry install_pre_deps_pytorch_xla $XLA_DIR $USE_CACHE CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch:${CMAKE_PREFIX_PATH}" XLA_SANDBOX_BUILD=1 build_torch_xla $XLA_DIR +<<<<<<< HEAD retry install_post_deps_pytorch_xla +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) assert_git_not_dirty } @@ -1475,8 +1583,11 @@ test_executorch() { export PYTHON_EXECUTABLE=python export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" +<<<<<<< HEAD # For llama3 bash examples/models/llama3_2_vision/install_requirements.sh +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch # from the PR bash .ci/scripts/setup-linux.sh --build-tool cmake @@ -1503,7 +1614,11 @@ test_executorch() { test_linux_aarch64() { python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \ test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \ +<<<<<<< HEAD test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \ +======= + test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops test_cpp_extensions_open_device_registration \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose # Dynamo tests @@ -1519,12 +1634,41 @@ test_linux_aarch64() { inductor/test_inplacing_pass inductor/test_kernel_benchmark inductor/test_layout_optim \ inductor/test_max_autotune 
inductor/test_memory_planning inductor/test_metrics inductor/test_multi_kernel inductor/test_pad_mm \ inductor/test_pattern_matcher inductor/test_perf inductor/test_profiler inductor/test_select_algorithm inductor/test_smoke \ +<<<<<<< HEAD inductor/test_split_cat_fx_passes inductor/test_standalone_compile inductor/test_torchinductor \ +======= + inductor/test_split_cat_fx_passes inductor/test_compile inductor/test_torchinductor \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inductor/test_torchinductor_codegen_dynamic_shapes inductor/test_torchinductor_dynamic_shapes inductor/test_memory \ inductor/test_triton_cpu_backend inductor/test_triton_extension_backend inductor/test_mkldnn_pattern_matcher inductor/test_cpu_cpp_wrapper \ --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose } +<<<<<<< HEAD +======= +test_operator_benchmark() { + TEST_REPORTS_DIR=$(pwd)/test/test-reports + mkdir -p "$TEST_REPORTS_DIR" + TEST_DIR=$(pwd) + + test_inductor_set_cpu_affinity + + cd benchmarks/operator_benchmark/pt_extension + python setup.py install + + cd "${TEST_DIR}"/benchmarks/operator_benchmark + $TASKSET python -m benchmark_all_test --device "$1" --tag-filter "$2" \ + --output-csv "${TEST_REPORTS_DIR}/operator_benchmark_eager_float32_cpu.csv" \ + --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_benchmark_eager_float32_cpu.json" \ + + pip_install pandas + python check_perf_csv.py \ + --actual "${TEST_REPORTS_DIR}/operator_benchmark_eager_float32_cpu.csv" \ + --expected "expected_ci_operator_benchmark_eager_float32_cpu.csv" +} + + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then (cd test && python -c "import torch; print(torch.__config__.show())") (cd test && python -c "import torch; print(torch.__config__.parallel_info())") @@ -1555,6 +1699,22 @@ elif [[ "$TEST_CONFIG" == distributed ]]; then if [[ "${SHARD_NUMBER}" == 1 ]]; then test_rpc fi +<<<<<<< HEAD +======= +elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then + TEST_MODE="short" + + if [[ "${TEST_CONFIG}" == *cpu* ]]; then + if [[ "${TEST_CONFIG}" == *long* ]]; then + TEST_MODE="long" + elif [[ "${TEST_CONFIG}" == *all* ]]; then + TEST_MODE="all" + fi + + test_operator_benchmark cpu ${TEST_MODE} + + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then test_inductor_distributed elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then @@ -1588,7 +1748,11 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then install_torchaudio cuda fi install_torchvision +<<<<<<< HEAD TORCH_CUDA_ARCH_LIST="8.0;8.6" pip_install git+https://github.com/pytorch/ao.git +======= + TORCH_CUDA_ARCH_LIST="8.0;8.6" install_torchao +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) id=$((SHARD_NUMBER-1)) # https://github.com/opencv/opencv-python/issues/885 pip_install opencv-python==4.8.0.74 @@ -1617,6 +1781,10 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then install_torchvision checkout_install_torchbench hf_T5 llama moco PYTHONPATH=$(pwd)/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER" +<<<<<<< HEAD +======= + test_inductor_aoti +>>>>>>> 5729657180 ([ROCm] Specialized binary 
elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [[ "${TEST_CONFIG}" == *inductor* ]]; then install_torchvision test_inductor_shard "${SHARD_NUMBER}" @@ -1625,6 +1793,11 @@ elif [[ "${TEST_CONFIG}" == *inductor* ]]; then test_inductor_distributed fi fi +<<<<<<< HEAD +======= +elif [[ "${TEST_CONFIG}" == *einops* ]]; then + test_einops +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then install_torchvision test_dynamo_wrapped_shard "${SHARD_NUMBER}" @@ -1670,6 +1843,13 @@ elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then test_python test_aten test_xpu_bin +<<<<<<< HEAD +======= +elif [[ "${TEST_CONFIG}" == smoke ]]; then + test_python_smoke +elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then + test_h100_distributed +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else install_torchvision install_monkeytype diff --git a/.ci/pytorch/test_example_code/CMakeLists.txt b/.ci/pytorch/test_example_code/CMakeLists.txt index e87f37ae61fb..688395d1615d 100644 --- a/.ci/pytorch/test_example_code/CMakeLists.txt +++ b/.ci/pytorch/test_example_code/CMakeLists.txt @@ -16,7 +16,11 @@ target_link_libraries(simple-torch-test CUDA::cudart CUDA::cufft CUDA::cusparse find_library(CUDNN_LIBRARY NAMES cudnn) target_link_libraries(simple-torch-test ${CUDNN_LIBRARY} ) if(MSVC) +<<<<<<< HEAD file(GLOB TORCH_DLLS "$ENV{CUDA_PATH}/bin/cudnn64_8.dll" "$ENV{NVTOOLSEXT_PATH}/bin/x64/*.dll") +======= + file(GLOB TORCH_DLLS "$ENV{CUDA_PATH}/bin/cudnn64_8.dll") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) message("dlls to copy " ${TORCH_DLLS}) add_custom_command(TARGET simple-torch-test POST_BUILD diff --git a/.ci/pytorch/win-build.sh b/.ci/pytorch/win-build.sh index 7966e56695c2..44c8874e3a2d 100755 --- a/.ci/pytorch/win-build.sh +++ b/.ci/pytorch/win-build.sh @@ -31,7 +31,11 @@ PYLONG_API_CHECK=$? if [[ $PYLONG_API_CHECK == 0 ]]; then echo "Usage of PyLong_{From,As}{Unsigned}Long API may lead to overflow errors on Windows" echo "because \`sizeof(long) == 4\` and \`sizeof(unsigned long) == 4\`." +<<<<<<< HEAD echo "Please include \"torch/csrc/utils/python_numbers.h\" and use the correspoding APIs instead." +======= + echo "Please include \"torch/csrc/utils/python_numbers.h\" and use the corresponding APIs instead." +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "PyLong_FromLong -> THPUtils_packInt32 / THPUtils_packInt64" echo "PyLong_AsLong -> THPUtils_unpackInt (32-bit) / THPUtils_unpackLong (64-bit)" echo "PyLong_FromUnsignedLong -> THPUtils_packUInt32 / THPUtils_packUInt64" diff --git a/.ci/pytorch/win-test-helpers/build_pytorch.bat b/.ci/pytorch/win-test-helpers/build_pytorch.bat index 297c0a689b24..9cdb0bf0cbf9 100644 --- a/.ci/pytorch/win-test-helpers/build_pytorch.bat +++ b/.ci/pytorch/win-test-helpers/build_pytorch.bat @@ -10,7 +10,11 @@ set PATH=C:\Program Files\CMake\bin;C:\Program Files\7-Zip;C:\ProgramData\chocol :: able to see what our cl.exe commands are (since you can actually :: just copy-paste them into a local Windows setup to just rebuild a :: single file.) 
+<<<<<<< HEAD :: log sizes are too long, but leaving this here incase someone wants to use it locally +======= +:: log sizes are too long, but leaving this here in case someone wants to use it locally +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) :: set CMAKE_VERBOSE_MAKEFILE=1 @@ -37,6 +41,14 @@ call %INSTALLER_DIR%\activate_miniconda3.bat if errorlevel 1 goto fail if not errorlevel 0 goto fail +<<<<<<< HEAD +======= +:: Update CMake +call choco upgrade -y cmake --no-progress --installargs 'ADD_CMAKE_TO_PATH=System' --apply-install-arguments-to-dependencies --version=3.27.9 +if errorlevel 1 goto fail +if not errorlevel 0 goto fail + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) call pip install mkl-include==2021.4.0 mkl-devel==2021.4.0 if errorlevel 1 goto fail if not errorlevel 0 goto fail @@ -88,7 +100,11 @@ set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH% :cuda_build_end set DISTUTILS_USE_SDK=1 +<<<<<<< HEAD set PATH=%TMP_DIR_WIN%\bin;%PATH% +======= +set PATH=%TMP_DIR_WIN%\bin;C:\Program Files\CMake\bin;%PATH% +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) :: The latest Windows CUDA test is running on AWS G5 runner with A10G GPU if "%TORCH_CUDA_ARCH_LIST%" == "" set TORCH_CUDA_ARCH_LIST=8.6 diff --git a/.ci/pytorch/win-test-helpers/installation-helpers/install_magma.bat b/.ci/pytorch/win-test-helpers/installation-helpers/install_magma.bat index d0fbf5b20d88..9ac40c5c23bb 100644 --- a/.ci/pytorch/win-test-helpers/installation-helpers/install_magma.bat +++ b/.ci/pytorch/win-test-helpers/installation-helpers/install_magma.bat @@ -24,7 +24,11 @@ if "%CUDA_SUFFIX%" == "" ( if "%REBUILD%"=="" ( if "%BUILD_ENVIRONMENT%"=="" ( +<<<<<<< HEAD curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --output %TMP_DIR_WIN%\magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z +======= + curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --output %TMP_DIR_WIN%\magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z & REM @lint-ignore +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) else ( aws s3 cp s3://ossci-windows/magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z %TMP_DIR_WIN%\magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --quiet ) diff --git a/.ci/pytorch/win-test-helpers/run_python_nn_smoketests.py b/.ci/pytorch/win-test-helpers/run_python_nn_smoketests.py index 6df547d4a3eb..7f3f252c6eb9 100755 --- a/.ci/pytorch/win-test-helpers/run_python_nn_smoketests.py +++ b/.ci/pytorch/win-test-helpers/run_python_nn_smoketests.py @@ -52,7 +52,11 @@ if os.path.exists(debugger): command_args = [debugger, "-o", "-c", "~*g; q"] + command_args command_string = " ".join(command_args) +<<<<<<< HEAD print("Reruning with traceback enabled") +======= + print("Rerunning with traceback enabled") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) print("Command:", command_string) subprocess.run(command_args, check=False) sys.exit(e.returncode) diff --git a/.ci/pytorch/win-test.sh b/.ci/pytorch/win-test.sh index 0426982a3ad9..7437435d55c3 100755 --- a/.ci/pytorch/win-test.sh +++ b/.ci/pytorch/win-test.sh @@ -38,7 +38,11 @@ if [[ 
"$BUILD_ENVIRONMENT" == *cuda* ]]; then fi # TODO: Move both of them to Windows AMI +<<<<<<< HEAD python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 pytest-subtests==0.13.1 +======= +python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install Z3 optional dependency for Windows builds. python -m pip install z3-solver==4.12.2.0 diff --git a/.ci/pytorch/windows/arm64/bootstrap_libuv.bat b/.ci/pytorch/windows/arm64/bootstrap_libuv.bat index 33272f3ef09d..9447512ee2eb 100644 --- a/.ci/pytorch/windows/arm64/bootstrap_libuv.bat +++ b/.ci/pytorch/windows/arm64/bootstrap_libuv.bat @@ -7,7 +7,11 @@ if not exist "%DOWNLOADS_DIR%" mkdir %DOWNLOADS_DIR% if not exist "%DEPENDENCIES_DIR%" mkdir %DEPENDENCIES_DIR% :: activate visual studio +<<<<<<< HEAD call "%DEPENDENCIES_DIR%\VSBuildTools\VC\Auxiliary\Build\vcvarsall.bat" arm64 +======= +call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) where cl.exe cd %DEPENDENCIES_DIR% diff --git a/.ci/pytorch/windows/arm64/bootstrap_openblas.bat b/.ci/pytorch/windows/arm64/bootstrap_openblas.bat index 463e765ede12..6e87228c4542 100644 --- a/.ci/pytorch/windows/arm64/bootstrap_openblas.bat +++ b/.ci/pytorch/windows/arm64/bootstrap_openblas.bat @@ -7,7 +7,11 @@ if not exist "%DOWNLOADS_DIR%" mkdir %DOWNLOADS_DIR% if not exist "%DEPENDENCIES_DIR%" mkdir %DEPENDENCIES_DIR% :: activate visual studio +<<<<<<< HEAD call "%DEPENDENCIES_DIR%\VSBuildTools\VC\Auxiliary\Build\vcvarsall.bat" arm64 +======= +call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) where cl.exe :: Clone OpenBLAS diff --git a/.ci/pytorch/windows/arm64/bootstrap_tests.bat b/.ci/pytorch/windows/arm64/bootstrap_tests.bat index c0fc48702604..debac12d9740 100644 --- a/.ci/pytorch/windows/arm64/bootstrap_tests.bat +++ b/.ci/pytorch/windows/arm64/bootstrap_tests.bat @@ -2,7 +2,11 @@ cd %PYTORCH_ROOT% :: activate visual studio +<<<<<<< HEAD call "%DEPENDENCIES_DIR%\VSBuildTools\VC\Auxiliary\Build\vcvarsall.bat" arm64 +======= +call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) where cl.exe :: create virtual environment diff --git a/.ci/pytorch/windows/arm64/build_libtorch.bat b/.ci/pytorch/windows/arm64/build_libtorch.bat index 139e0b47be58..c2f251b5ddce 100644 --- a/.ci/pytorch/windows/arm64/build_libtorch.bat +++ b/.ci/pytorch/windows/arm64/build_libtorch.bat @@ -21,7 +21,11 @@ if %ENABLE_APL% == 1 ( ) :: activate visual studio +<<<<<<< HEAD call "%DEPENDENCIES_DIR%\VSBuildTools\VC\Auxiliary\Build\vcvarsall.bat" arm64 +======= +call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) where cl.exe :: change to source directory diff --git 
a/.ci/pytorch/windows/arm64/build_pytorch.bat b/.ci/pytorch/windows/arm64/build_pytorch.bat index b4d67b48e4fc..6f5133c32ddf 100644 --- a/.ci/pytorch/windows/arm64/build_pytorch.bat +++ b/.ci/pytorch/windows/arm64/build_pytorch.bat @@ -21,7 +21,11 @@ if %ENABLE_APL% == 1 ( ) :: activate visual studio +<<<<<<< HEAD call "%DEPENDENCIES_DIR%\VSBuildTools\VC\Auxiliary\Build\vcvarsall.bat" arm64 +======= +call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) where cl.exe :: change to source directory diff --git a/.ci/pytorch/windows/arm64/smoke_test.bat b/.ci/pytorch/windows/arm64/smoke_test.bat index 378413cffc85..db410f4b5139 100644 --- a/.ci/pytorch/windows/arm64/smoke_test.bat +++ b/.ci/pytorch/windows/arm64/smoke_test.bat @@ -33,7 +33,11 @@ pushd tmp set VC_VERSION_LOWER=14 set VC_VERSION_UPPER=36 +<<<<<<< HEAD call "%DEPENDENCIES_DIR%\VSBuildTools\VC\Auxiliary\Build\vcvarsall.bat" arm64 +======= +call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set install_root=%CD% set INCLUDE=%INCLUDE%;%install_root%\include;%install_root%\include\torch\csrc\api\include diff --git a/.ci/pytorch/windows/build_pytorch.bat b/.ci/pytorch/windows/build_pytorch.bat index 2a1b73a527d1..de69b6ad5d29 100644 --- a/.ci/pytorch/windows/build_pytorch.bat +++ b/.ci/pytorch/windows/build_pytorch.bat @@ -1,7 +1,12 @@ @echo off +<<<<<<< HEAD :: This script parses args, installs required libraries (miniconda, MKL, :: Magma), and then delegates to cpu.bat, cuda80.bat, etc. +======= +:: This script parses args, installs required libraries (MKL, Magma, libuv) +:: and then delegates to cpu.bat, cuda80.bat, etc. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if not "%CUDA_VERSION%" == "" if not "%PYTORCH_BUILD_VERSION%" == "" if not "%PYTORCH_BUILD_NUMBER%" == "" goto env_end if "%~1"=="" goto arg_error @@ -36,6 +41,7 @@ set DESIRED_PYTHON_PREFIX=py%DESIRED_PYTHON_PREFIX:;=;py% set SRC_DIR=%~dp0 pushd %SRC_DIR% +<<<<<<< HEAD :: Install Miniconda3 set "CONDA_HOME=%CD%\conda" set "tmp_conda=%CONDA_HOME%" @@ -49,15 +55,28 @@ set "ORIG_PATH=%PATH%" set "PATH=%CONDA_HOME%;%CONDA_HOME%\scripts;%CONDA_HOME%\Library\bin;%PATH%" :: create a new conda environment and install packages +======= +set "ORIG_PATH=%PATH%" + +:: setup build environment +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) :try SET /A tries=3 :loop IF %tries% LEQ 0 GOTO :exception +<<<<<<< HEAD call condaenv.bat IF %ERRORLEVEL% EQU 0 GOTO :done SET /A "tries=%tries%-1" :exception echo "Failed to create conda env" +======= +call setup_build.bat +IF %ERRORLEVEL% EQU 0 GOTO :done +SET /A "tries=%tries%-1" +:exception +echo "Failed to setup build environment" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) exit /B 1 :done @@ -73,7 +92,11 @@ if "%DEBUG%" == "1" ( if not "%CUDA_VERSION%" == "cpu" if not "%CUDA_VERSION%" == "xpu" ( rmdir /s /q magma_%CUDA_PREFIX%_%BUILD_TYPE% del magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z +<<<<<<< HEAD curl -k https://s3.amazonaws.com/ossci-windows/magma_%MAGMA_VERSION%_%CUDA_PREFIX%_%BUILD_TYPE%.7z -o magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z +======= + curl -k https://s3.amazonaws.com/ossci-windows/magma_%MAGMA_VERSION%_%CUDA_PREFIX%_%BUILD_TYPE%.7z -o magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z %= @lint-ignore =% +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 7z x -aoa magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z -omagma_%CUDA_PREFIX%_%BUILD_TYPE% ) @@ -107,6 +130,7 @@ set TH_BINARY_BUILD=1 set INSTALL_TEST=0 for %%v in (%DESIRED_PYTHON_PREFIX%) do ( +<<<<<<< HEAD :: Activate Python Environment set PYTHON_PREFIX=%%v set "CONDA_LIB_PATH=%CONDA_HOME%\envs\%%v\Library\bin" @@ -115,11 +139,26 @@ for %%v in (%DESIRED_PYTHON_PREFIX%) do ( ) else ( set "PATH=%CONDA_HOME%\envs\%%v;%CONDA_HOME%\envs\%%v\scripts;%CONDA_HOME%\envs\%%v\Library\bin;%ORIG_PATH%" ) +======= + + :: Set Environment vars for the build + set "CMAKE_PREFIX_PATH=%CD%\Python\Library\;%PATH%" + set "PYTHON_LIB_PATH=%CD%\Python\Library\bin" + + if not "%ADDITIONAL_PATH%" == "" ( + set "PATH=%ADDITIONAL_PATH%;%PATH%" + ) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pip install ninja @setlocal :: Set Flags if not "%CUDA_VERSION%"=="cpu" if not "%CUDA_VERSION%" == "xpu" ( +<<<<<<< HEAD set MAGMA_HOME=%cd%\magma_%CUDA_PREFIX%_%BUILD_TYPE% +======= + set "MAGMA_HOME=%cd%\magma_%CUDA_PREFIX%_%BUILD_TYPE%" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) echo "Calling arch build script" call %CUDA_PREFIX%.bat diff --git a/.ci/pytorch/windows/cuda126.bat b/.ci/pytorch/windows/cuda126.bat index efb8cfec63e7..2db616810ecb 100644 --- a/.ci/pytorch/windows/cuda126.bat +++ b/.ci/pytorch/windows/cuda126.bat @@ -18,6 +18,7 @@ REM Check for optional components set USE_CUDA= set CMAKE_GENERATOR=Visual Studio 15 2017 Win64 
+<<<<<<< HEAD IF "%NVTOOLSEXT_PATH%"=="" ( IF EXIST "C:\Program Files\NVIDIA Corporation\NvToolsExt\lib\x64\nvToolsExt64_1.lib" ( set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt @@ -27,6 +28,8 @@ IF "%NVTOOLSEXT_PATH%"=="" ( ) ) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) IF "%CUDA_PATH_V126%"=="" ( IF EXIST "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6\bin\nvcc.exe" ( set "CUDA_PATH_V126=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6" @@ -37,7 +40,11 @@ IF "%CUDA_PATH_V126%"=="" ( ) IF "%BUILD_VISION%" == "" ( +<<<<<<< HEAD set TORCH_CUDA_ARCH_LIST=5.0;6.0;6.1;7.0;7.5;8.0;8.6;9.0 +======= + set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set TORCH_NVCC_FLAGS=-Xfatbin -compress-all ) ELSE ( set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 diff --git a/.ci/pytorch/windows/cuda128.bat b/.ci/pytorch/windows/cuda128.bat index f660f1d0a699..1657918b73a3 100644 --- a/.ci/pytorch/windows/cuda128.bat +++ b/.ci/pytorch/windows/cuda128.bat @@ -18,6 +18,7 @@ REM Check for optional components set USE_CUDA= set CMAKE_GENERATOR=Visual Studio 15 2017 Win64 +<<<<<<< HEAD IF "%NVTOOLSEXT_PATH%"=="" ( IF EXIST "C:\Program Files\NVIDIA Corporation\NvToolsExt\lib\x64\nvToolsExt64_1.lib" ( set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt @@ -27,6 +28,8 @@ IF "%NVTOOLSEXT_PATH%"=="" ( ) ) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) IF "%CUDA_PATH_V128%"=="" ( IF EXIST "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin\nvcc.exe" ( set "CUDA_PATH_V128=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8" @@ -37,7 +40,11 @@ IF "%CUDA_PATH_V128%"=="" ( ) IF "%BUILD_VISION%" == "" ( +<<<<<<< HEAD set TORCH_CUDA_ARCH_LIST=5.0;6.0;6.1;7.0;7.5;8.0;8.6;9.0;10.0;12.0 +======= + set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0;10.0;12.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set TORCH_NVCC_FLAGS=-Xfatbin -compress-all ) ELSE ( set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120 diff --git a/.ci/pytorch/windows/cuda129.bat b/.ci/pytorch/windows/cuda129.bat new file mode 100644 index 000000000000..9ef36342f269 --- /dev/null +++ b/.ci/pytorch/windows/cuda129.bat @@ -0,0 +1,50 @@ +@echo off + +set MODULE_NAME=pytorch + +IF NOT EXIST "setup.py" IF NOT EXIST "%MODULE_NAME%" ( + call internal\clone.bat + cd %~dp0 +) ELSE ( + call internal\clean.bat +) +IF ERRORLEVEL 1 goto :eof + +call internal\check_deps.bat +IF ERRORLEVEL 1 goto :eof + +REM Check for optional components + +set USE_CUDA= +set CMAKE_GENERATOR=Visual 
Studio 15 2017 Win64 + +IF "%CUDA_PATH_V129%"=="" ( + IF EXIST "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9\bin\nvcc.exe" ( + set "CUDA_PATH_V129=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9" + ) ELSE ( + echo CUDA 12.9 not found, failing + exit /b 1 + ) +) + +IF "%BUILD_VISION%" == "" ( + set TORCH_CUDA_ARCH_LIST=7.0;7.5;8.0;8.6;9.0;10.0;12.0 + set TORCH_NVCC_FLAGS=-Xfatbin -compress-all +) ELSE ( + set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120 +) + +set "CUDA_PATH=%CUDA_PATH_V129%" +set "PATH=%CUDA_PATH_V129%\bin;%PATH%" + +:optcheck + +call internal\check_opts.bat +IF ERRORLEVEL 1 goto :eof + +if exist "%NIGHTLIES_PYTORCH_ROOT%" cd %NIGHTLIES_PYTORCH_ROOT%\.. +call %~dp0\internal\copy.bat +IF ERRORLEVEL 1 goto :eof + +call %~dp0\internal\setup.bat +IF ERRORLEVEL 1 goto :eof diff --git a/.ci/pytorch/windows/internal/7z_install.bat b/.ci/pytorch/windows/internal/7z_install.bat index d5a1156360d9..50522e60f9ff 100644 --- a/.ci/pytorch/windows/internal/7z_install.bat +++ b/.ci/pytorch/windows/internal/7z_install.bat @@ -1,6 +1,10 @@ @echo off +<<<<<<< HEAD curl -k https://www.7-zip.org/a/7z1805-x64.exe -O +======= +curl -k -L "https://sourceforge.net/projects/sevenzip/files/7-Zip/18.05/7z1805-x64.exe/download" -o 7z1805-x64.exe +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if errorlevel 1 exit /b 1 start /wait 7z1805-x64.exe /S diff --git a/.ci/pytorch/windows/internal/check_deps.bat b/.ci/pytorch/windows/internal/check_deps.bat index 46f438615774..0c4a65553f92 100644 --- a/.ci/pytorch/windows/internal/check_deps.bat +++ b/.ci/pytorch/windows/internal/check_deps.bat @@ -65,7 +65,11 @@ for /F "usebackq delims=" %%i in (`python -c "import sys; print('{0[0]}{0[1]}'.f if %PYVER% LSS 35 ( echo Warning: PyTorch for Python 2 under Windows is experimental. 
echo Python x64 3.5 or up is recommended to compile PyTorch on Windows +<<<<<<< HEAD echo Maybe you can create a virual environment if you have conda installed: +======= + echo Maybe you can create a virtual environment if you have conda installed: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo ^> conda create -n test python=3.6 pyyaml numpy echo ^> activate test ) diff --git a/.ci/pytorch/windows/internal/clone.bat b/.ci/pytorch/windows/internal/clone.bat index d76d13db1763..0ee9bebba223 100644 --- a/.ci/pytorch/windows/internal/clone.bat +++ b/.ci/pytorch/windows/internal/clone.bat @@ -8,7 +8,11 @@ goto submodule :clone_pytorch +<<<<<<< HEAD git clone https://github.com/%PYTORCH_REPO%/%MODULE_NAME% +======= +git clone https://github.com/%PYTORCH_REPO%/%MODULE_NAME% & REM @lint-ignore +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cd %MODULE_NAME% diff --git a/.ci/pytorch/windows/internal/copy.bat b/.ci/pytorch/windows/internal/copy.bat index b2d078944a5d..38fd82c25f4e 100644 --- a/.ci/pytorch/windows/internal/copy.bat +++ b/.ci/pytorch/windows/internal/copy.bat @@ -9,8 +9,12 @@ copy "%CUDA_PATH%\bin\cudnn*64_*.dll*" pytorch\torch\lib copy "%CUDA_PATH%\bin\nvrtc*64_*.dll*" pytorch\torch\lib copy "%CUDA_PATH%\extras\CUPTI\lib64\cupti64_*.dll*" pytorch\torch\lib +<<<<<<< HEAD copy "C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64\nvToolsExt64_1.dll*" pytorch\torch\lib copy "%CONDA_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib +======= +copy "%PYTHON_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) :: Should be set in build_pytorch.bat copy "%libuv_ROOT%\bin\uv.dll" pytorch\torch\lib diff --git a/.ci/pytorch/windows/internal/copy_cpu.bat b/.ci/pytorch/windows/internal/copy_cpu.bat index 864180d85dd1..1c324b600f25 100644 --- a/.ci/pytorch/windows/internal/copy_cpu.bat +++ b/.ci/pytorch/windows/internal/copy_cpu.bat @@ -1,3 +1,9 @@ +<<<<<<< HEAD copy "%CONDA_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib :: Should be set in build_pytorch.bat -copy "%libuv_ROOT%\bin\uv.dll" pytorch\torch\lib \ No newline at end of file +copy "%libuv_ROOT%\bin\uv.dll" pytorch\torch\lib +======= +copy "%PYTHON_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib +:: Should be set in build_pytorch.bat +copy "%libuv_ROOT%\bin\uv.dll" pytorch\torch\lib +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/pytorch/windows/internal/cuda_install.bat b/.ci/pytorch/windows/internal/cuda_install.bat index 7e33b0805c9c..87bb978d5550 100644 --- a/.ci/pytorch/windows/internal/cuda_install.bat +++ b/.ci/pytorch/windows/internal/cuda_install.bat @@ -23,14 +23,21 @@ set CUDNN_LIB_FOLDER="lib\x64" :: Skip all of this if we already have cuda installed if exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\nvcc.exe" goto set_cuda_env_vars +<<<<<<< HEAD if %CUDA_VER% EQU 118 goto cuda118 if %CUDA_VER% EQU 124 goto cuda124 if %CUDA_VER% EQU 126 goto cuda126 if %CUDA_VER% EQU 128 goto cuda128 +======= +if %CUDA_VER% EQU 126 goto cuda126 +if %CUDA_VER% EQU 128 goto cuda128 +if %CUDA_VER% EQU 129 goto cuda129 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo 
CUDA %CUDA_VERSION_STR% is not supported exit /b 1 +<<<<<<< HEAD :cuda118 set CUDA_INSTALL_EXE=cuda_11.8.0_522.06_windows.exe @@ -83,13 +90,19 @@ curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "% 7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib" xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) goto cuda_common :cuda126 set CUDA_INSTALL_EXE=cuda_12.6.2_560.94_windows.exe if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( +<<<<<<< HEAD curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" +======= + curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" & REM @lint-ignore +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if errorlevel 1 exit /b 1 set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" set "ARGS=cuda_profiler_api_12.6 thrust_12.6 nvcc_12.6 cuobjdump_12.6 nvprune_12.6 nvprof_12.6 cupti_12.6 cublas_12.6 cublas_dev_12.6 cudart_12.6 cufft_12.6 cufft_dev_12.6 curand_12.6 curand_dev_12.6 cusolver_12.6 cusolver_dev_12.6 cusparse_12.6 cusparse_dev_12.6 npp_12.6 npp_dev_12.6 nvrtc_12.6 nvrtc_dev_12.6 nvml_dev_12.6 nvjitlink_12.6 nvtx_12.6" @@ -99,7 +112,11 @@ set CUDNN_FOLDER=cudnn-windows-x86_64-9.5.0.50_cuda12-archive set CUDNN_LIB_FOLDER="lib" set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip" if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ( +<<<<<<< HEAD curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" +======= + curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" & REM @lint-ignore +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if errorlevel 1 exit /b 1 set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ) @@ -116,7 +133,11 @@ goto cuda_common set CUDA_INSTALL_EXE=cuda_12.8.0_571.96_windows.exe if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( +<<<<<<< HEAD curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" +======= + curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" & REM @lint-ignore +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if errorlevel 1 exit /b 1 set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" set "ARGS=cuda_profiler_api_12.8 thrust_12.8 nvcc_12.8 cuobjdump_12.8 nvprune_12.8 nvprof_12.8 cupti_12.8 cublas_12.8 cublas_dev_12.8 cudart_12.8 cufft_12.8 cufft_dev_12.8 curand_12.8 curand_dev_12.8 cusolver_12.8 cusolver_dev_12.8 cusparse_12.8 cusparse_dev_12.8 npp_12.8 npp_dev_12.8 nvrtc_12.8 nvrtc_dev_12.8 nvml_dev_12.8 nvjitlink_12.8 nvtx_12.8" @@ -126,7 +147,38 @@ set CUDNN_FOLDER=cudnn-windows-x86_64-9.7.0.66_cuda12-archive set CUDNN_LIB_FOLDER="lib" set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip" if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ( +<<<<<<< HEAD curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output 
"%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" +======= + curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" & REM @lint-ignore + if errorlevel 1 exit /b 1 + set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" +) + +@REM cuDNN 8.3+ required zlib to be installed on the path +echo Installing ZLIB dlls +curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip" +7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib" +xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32" + +goto cuda_common + +:cuda129 + +set CUDA_INSTALL_EXE=cuda_12.9.1_576.57_windows.exe +if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( + curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" & REM @lint-ignore + if errorlevel 1 exit /b 1 + set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" + set "ARGS=cuda_profiler_api_12.9 thrust_12.9 nvcc_12.9 cuobjdump_12.9 nvprune_12.9 nvprof_12.9 cupti_12.9 cublas_12.9 cublas_dev_12.9 cudart_12.9 cufft_12.9 cufft_dev_12.9 curand_12.9 curand_dev_12.9 cusolver_12.9 cusolver_dev_12.9 cusparse_12.9 cusparse_dev_12.9 npp_12.9 npp_dev_12.9 nvrtc_12.9 nvrtc_dev_12.9 nvml_dev_12.9 nvjitlink_12.9 nvtx_12.9" +) + +set CUDNN_FOLDER=cudnn-windows-x86_64-9.10.2.21_cuda12-archive +set CUDNN_LIB_FOLDER="lib" +set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip" +if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ( + curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" & REM @lint-ignore +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if errorlevel 1 exit /b 1 set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ) @@ -145,11 +197,14 @@ goto cuda_common :: If you cannot find the CUDA version you want to build for here then please :: add it @ https://github.com/pytorch/test-infra/tree/main/aws/ami/windows if not exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\nvcc.exe" ( +<<<<<<< HEAD if not exist "%SRC_DIR%\temp_build\NvToolsExt.7z" ( curl -k -L https://ossci-windows.s3.us-east-1.amazonaws.com/builder/NvToolsExt.7z --output "%SRC_DIR%\temp_build\NvToolsExt.7z" if errorlevel 1 exit /b 1 ) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if not exist "%SRC_DIR%\temp_build\gpu_driver_dlls.zip" ( curl -k -L "https://ossci-windows.s3.us-east-1.amazonaws.com/builder/additional_dlls.zip" --output "%SRC_DIR%\temp_build\gpu_driver_dlls.zip" if errorlevel 1 exit /b 1 @@ -176,6 +231,7 @@ if not exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_ xcopy /Y "%SRC_DIR%\temp_build\cuda\CUDAVisualStudioIntegration\extras\visual_studio_integration\MSBuildExtensions\*.*" "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v170\BuildCustomizations" ) +<<<<<<< HEAD echo Installing NvToolsExt... 
7z x %SRC_DIR%\temp_build\NvToolsExt.7z -o"%SRC_DIR%\temp_build\NvToolsExt" mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64" @@ -185,6 +241,8 @@ if not exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_ xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\include\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\include" xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\lib\x64\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\lib\x64" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo Installing cuDNN... 7z x %CUDNN_SETUP_FILE% -o"%SRC_DIR%\temp_build\cudnn" xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin" @@ -215,4 +273,7 @@ echo Setting up environment... set "PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin;%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\libnvvp;%PATH%" set "CUDA_PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%" set "CUDA_PATH_V%CUDA_VER_MAJOR%_%CUDA_VER_MINOR%=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%" +<<<<<<< HEAD set "NVTOOLSEXT_PATH=%ProgramFiles%\NVIDIA Corporation\NvToolsExt" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/pytorch/windows/internal/driver_update.bat b/.ci/pytorch/windows/internal/driver_update.bat index 551aa9c7a8a4..bd80f3657e75 100644 --- a/.ci/pytorch/windows/internal/driver_update.bat +++ b/.ci/pytorch/windows/internal/driver_update.bat @@ -1,5 +1,9 @@ set WIN_DRIVER_VN=528.89 +<<<<<<< HEAD set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/%WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe" +======= +set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/%WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe" & REM @lint-ignore +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe if errorlevel 1 exit /b 1 diff --git a/.ci/pytorch/windows/internal/install_python.bat b/.ci/pytorch/windows/internal/install_python.bat new file mode 100644 index 000000000000..642acdb3981b --- /dev/null +++ b/.ci/pytorch/windows/internal/install_python.bat @@ -0,0 +1,20 @@ +set ADDITIONAL_OPTIONS="" +set PYTHON_EXEC="python" +if "%DESIRED_PYTHON%" == "3.13t" ( + echo Python version is set to 3.13t + set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe" + set ADDITIONAL_OPTIONS="Include_freethreaded=1" + set PYTHON_EXEC="python3.13t" +) else ( + echo DESIRED_PYTHON not defined, Python version is set to %DESIRED_PYTHON% + set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/%DESIRED_PYTHON%.0/python-%DESIRED_PYTHON%.0-amd64.exe" %= @lint-ignore =% +) + +del python-amd64.exe +curl --retry 3 -kL "%PYTHON_INSTALLER_URL%" --output python-amd64.exe +if errorlevel 1 exit /b 1 + +start /wait "" python-amd64.exe /quiet InstallAllUsers=1 PrependPath=0 Include_test=0 %ADDITIONAL_OPTIONS% TargetDir=%CD%\Python +if errorlevel 1 exit /b 1 + +set "PATH=%CD%\Python\Scripts;%CD%\Python;%PATH%" diff --git 
a/.ci/pytorch/windows/internal/setup.bat b/.ci/pytorch/windows/internal/setup.bat index f57bdcbec4bc..d8db85aa23d5 100644 --- a/.ci/pytorch/windows/internal/setup.bat +++ b/.ci/pytorch/windows/internal/setup.bat @@ -51,7 +51,11 @@ mkdir libtorch\test mkdir build pushd build +<<<<<<< HEAD python ../tools/build_libtorch.py +======= +%PYTHON_EXEC% ../tools/build_libtorch.py +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) popd IF ERRORLEVEL 1 exit /b 1 @@ -86,7 +90,11 @@ copy /Y "%LIBTORCH_PREFIX%-%PYTORCH_BUILD_VERSION%.zip" "%PYTORCH_FINAL_PACKAGE_ goto build_end :pytorch +<<<<<<< HEAD python setup.py bdist_wheel -d "%PYTORCH_FINAL_PACKAGE_DIR%" +======= +%PYTHON_EXEC% setup.py bdist_wheel -d "%PYTORCH_FINAL_PACKAGE_DIR%" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) :build_end IF ERRORLEVEL 1 exit /b 1 diff --git a/.ci/pytorch/windows/internal/smoke_test.bat b/.ci/pytorch/windows/internal/smoke_test.bat index 3f9cc83cbb8d..d15997bdaaf5 100644 --- a/.ci/pytorch/windows/internal/smoke_test.bat +++ b/.ci/pytorch/windows/internal/smoke_test.bat @@ -35,6 +35,7 @@ exit /b 1 :wheel echo "install wheel package" +<<<<<<< HEAD set PYTHON_INSTALLER_URL= if "%DESIRED_PYTHON%" == "3.13t" set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe" if "%DESIRED_PYTHON%" == "3.13" set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe" @@ -66,6 +67,10 @@ start /wait "" python-amd64.exe /quiet InstallAllUsers=1 PrependPath=0 Include_t if errorlevel 1 exit /b 1 set "PATH=%CD%\Python%PYTHON_VERSION%\Scripts;%CD%\Python;%PATH%" +======= +call "internal\install_python.bat" + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if "%DESIRED_PYTHON%" == "3.13t" %PYTHON_EXEC% -m pip install --pre numpy==2.2.1 protobuf if "%DESIRED_PYTHON%" == "3.13" %PYTHON_EXEC% -m pip install --pre numpy==2.1.2 protobuf if "%DESIRED_PYTHON%" == "3.12" %PYTHON_EXEC% -m pip install --pre numpy==2.0.2 protobuf @@ -82,7 +87,11 @@ if "%PYTORCH_BUILD_VERSION:dev=%" NEQ "%PYTORCH_BUILD_VERSION%" ( ) set "EXTRA_INDEX= " +<<<<<<< HEAD if "%CUDA_VERSION%" == "xpu" set "EXTRA_INDEX=--index-url https://download.pytorch.org/whl/%CHANNEL%/xpu" +======= +if "%CUDA_VERSION%" == "xpu" set "EXTRA_INDEX=--index-url https://download.pytorch.org/whl/%CHANNEL%/xpu" %= @lint-ignore =% +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for /F "delims=" %%i in ('where /R "%PYTORCH_FINAL_PACKAGE_DIR:/=\%" *.whl') do %PYTHON_EXEC% -m pip install "%%i" %EXTRA_INDEX% if errorlevel 1 exit /b 1 @@ -128,7 +137,10 @@ goto end :libtorch echo "install and test libtorch" +<<<<<<< HEAD if "%VC_YEAR%" == "2019" powershell internal\vs2019_install.ps1 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if "%VC_YEAR%" == "2022" powershell internal\vs2022_install.ps1 if ERRORLEVEL 1 exit /b 1 @@ -140,10 +152,13 @@ pushd tmp\libtorch set VC_VERSION_LOWER=17 set VC_VERSION_UPPER=18 +<<<<<<< HEAD IF "%VC_YEAR%" == "2019" ( set VC_VERSION_LOWER=16 set VC_VERSION_UPPER=17 ) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for /f 
"usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [%VC_VERSION_LOWER%^,%VC_VERSION_UPPER%^) -property installationPath`) do ( if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( diff --git a/.ci/pytorch/windows/internal/static_lib_test.bat b/.ci/pytorch/windows/internal/static_lib_test.bat index ed8729408983..5e20ed914f07 100644 --- a/.ci/pytorch/windows/internal/static_lib_test.bat +++ b/.ci/pytorch/windows/internal/static_lib_test.bat @@ -37,7 +37,11 @@ if "%DEBUG%" == "1" ( if not "%CUDA_VERSION%" == "cpu" ( rmdir /s /q magma_%CUDA_PREFIX%_%BUILD_TYPE% del magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z +<<<<<<< HEAD curl -k https://s3.amazonaws.com/ossci-windows/magma_%MAGMA_VERSION%_%CUDA_PREFIX%_%BUILD_TYPE%.7z -o magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z +======= + curl -k https://s3.amazonaws.com/ossci-windows/magma_%MAGMA_VERSION%_%CUDA_PREFIX%_%BUILD_TYPE%.7z -o magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z & REM @lint-ignore +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 7z x -aoa magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z -omagma_%CUDA_PREFIX%_%BUILD_TYPE% set LIB=%CD%\magma_%CUDA_PREFIX%_%BUILD_TYPE%\lib;%LIB% ) diff --git a/.ci/pytorch/windows/internal/vc_install_helper.bat b/.ci/pytorch/windows/internal/vc_install_helper.bat index 61ab6d5f8c98..bc425a224ff6 100644 --- a/.ci/pytorch/windows/internal/vc_install_helper.bat +++ b/.ci/pytorch/windows/internal/vc_install_helper.bat @@ -1,12 +1,18 @@ +<<<<<<< HEAD if "%VC_YEAR%" == "2019" powershell windows/internal/vs2019_install.ps1 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if "%VC_YEAR%" == "2022" powershell windows/internal/vs2022_install.ps1 set VC_VERSION_LOWER=17 set VC_VERSION_UPPER=18 +<<<<<<< HEAD if "%VC_YEAR%" == "2019" ( set VC_VERSION_LOWER=16 set VC_VERSION_UPPER=17 ) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -products Microsoft.VisualStudio.Product.BuildTools -version [%VC_VERSION_LOWER%^,%VC_VERSION_UPPER%^) -property installationPath`) do ( if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( diff --git a/.ci/pytorch/windows/internal/xpu_install.bat b/.ci/pytorch/windows/internal/xpu_install.bat index 94e7554cf13f..f341ac0df091 100644 --- a/.ci/pytorch/windows/internal/xpu_install.bat +++ b/.ci/pytorch/windows/internal/xpu_install.bat @@ -10,6 +10,7 @@ if not "%CUDA_VERSION%" == "xpu" ( set SRC_DIR=%NIGHTLIES_PYTORCH_ROOT% if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build" +<<<<<<< HEAD set XPU_INSTALL_MODE=%~1 if "%XPU_INSTALL_MODE%"=="" goto xpu_bundle_install_start if "%XPU_INSTALL_MODE%"=="bundle" goto xpu_bundle_install_start @@ -57,6 +58,25 @@ if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.0] ( set XPU_EXTRA_VERSION=2025.0.1+1226 set XPU_EXTRA_INSTALLED=0 set XPU_EXTRA_UNINSTALL=0 +======= +:xpu_bundle_install_start + +set XPU_BUNDLE_PARENT_DIR=C:\Program Files (x86)\Intel\oneAPI +set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9d6d6c17-ca2d-4735-9331-99447e4a1280/intel-deep-learning-essentials-2025.0.1.28_offline.exe +set XPU_BUNDLE_PRODUCT_NAME=intel.oneapi.win.deep-learning-essentials.product +set 
XPU_BUNDLE_VERSION=2025.0.1+20 +set XPU_BUNDLE_INSTALLED=0 +set XPU_BUNDLE_UNINSTALL=0 +set XPU_EXTRA_URL=NULL +set XPU_EXTRA_PRODUCT_NAME=intel.oneapi.win.compiler.product +set XPU_EXTRA_VERSION=2025.0.1+1226 +set XPU_EXTRA_INSTALLED=0 +set XPU_EXTRA_UNINSTALL=0 + +if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.1] ( + set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe + set XPU_BUNDLE_VERSION=2025.1.3+5 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) :: Check if XPU bundle is target version or already installed diff --git a/.ci/pytorch/windows/setup_build.bat b/.ci/pytorch/windows/setup_build.bat new file mode 100644 index 000000000000..9b492eef664d --- /dev/null +++ b/.ci/pytorch/windows/setup_build.bat @@ -0,0 +1,27 @@ +IF "%DESIRED_PYTHON%"=="" ( + echo DESIRED_PYTHON is NOT defined. + exit /b 1 +) + +call "internal\install_python.bat" + +%PYTHON_EXEC% --version +set "PATH=%CD%\Python\Lib\site-packages\cmake\data\bin;%CD%\Python\Scripts;%CD%\Python;%PATH%" +if "%DESIRED_PYTHON%" == "3.13t" %PYTHON_EXEC% -m pip install numpy==2.2.1 cmake +if "%DESIRED_PYTHON%" == "3.13" %PYTHON_EXEC% -m pip install numpy==2.1.2 cmake +if "%DESIRED_PYTHON%" == "3.12" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake +if "%DESIRED_PYTHON%" == "3.11" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake +if "%DESIRED_PYTHON%" == "3.10" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake +if "%DESIRED_PYTHON%" == "3.9" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake + +%PYTHON_EXEC% -m pip install pyyaml +%PYTHON_EXEC% -m pip install mkl-include mkl-static +%PYTHON_EXEC% -m pip install boto3 ninja typing_extensions setuptools==72.1.0 + +where cmake.exe + +:: Install libuv +curl -k https://s3.amazonaws.com/ossci-windows/libuv-1.40.0-h8ffe710_0.tar.bz2 -o libuv-1.40.0-h8ffe710_0.tar.bz2 +7z x -aoa libuv-1.40.0-h8ffe710_0.tar.bz2 +tar -xvf libuv-1.40.0-h8ffe710_0.tar -C %CD%\Python\ +set libuv_ROOT=%CD%\Python\Library diff --git a/.ci/pytorch/windows/xpu.bat b/.ci/pytorch/windows/xpu.bat index f9f5d9833839..975c7bd5bb3d 100644 --- a/.ci/pytorch/windows/xpu.bat +++ b/.ci/pytorch/windows/xpu.bat @@ -26,6 +26,10 @@ set VS2022INSTALLDIR=%VS15INSTALLDIR% set XPU_BUNDLE_ROOT=%ProgramFiles(x86)%\Intel\oneAPI call "%XPU_BUNDLE_ROOT%\compiler\latest\env\vars.bat" call "%XPU_BUNDLE_ROOT%\ocloc\latest\env\vars.bat" +<<<<<<< HEAD +======= +set USE_ONEMKL=1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) IF ERRORLEVEL 1 goto :eof if exist "%NIGHTLIES_PYTORCH_ROOT%" cd %NIGHTLIES_PYTORCH_ROOT%\.. 
diff --git a/.ci/wheel/build_wheel.sh b/.ci/wheel/build_wheel.sh index b6b0d978cc23..4465c9184f25 100755 --- a/.ci/wheel/build_wheel.sh +++ b/.ci/wheel/build_wheel.sh @@ -206,7 +206,11 @@ if [[ "$USE_SPLIT_BUILD" == "true" ]]; then BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 python setup.py bdist_wheel -d "$whl_tmp_dir" echo "Finished setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)" echo "Calling setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)" +<<<<<<< HEAD BUILD_PYTHON_ONLY=1 BUILD_LIBTORCH_WHL=0 python setup.py bdist_wheel -d "$whl_tmp_dir" --cmake +======= + BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 CMAKE_FRESH=1 python setup.py bdist_wheel -d "$whl_tmp_dir" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "Finished setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)" else python setup.py bdist_wheel -d "$whl_tmp_dir" diff --git a/.circleci/scripts/binary_linux_test.sh b/.circleci/scripts/binary_linux_test.sh index b2db95ab62fa..f677aaca80c3 100755 --- a/.circleci/scripts/binary_linux_test.sh +++ b/.circleci/scripts/binary_linux_test.sh @@ -90,11 +90,28 @@ fi /pytorch/.ci/pytorch/check_binary.sh if [[ "\$GPU_ARCH_TYPE" != *s390x* && "\$GPU_ARCH_TYPE" != *xpu* && "\$GPU_ARCH_TYPE" != *rocm* && "$PACKAGE_TYPE" != libtorch ]]; then +<<<<<<< HEAD # Exclude s390, xpu, rocm and libtorch builds from smoke testing python /pytorch/.ci/pytorch/smoke_test/smoke_test.py --package=torchonly --torch-compile-check disabled if [[ "\$GPU_ARCH_TYPE" != *cpu-aarch64* ]]; then # test for issue https://github.com/pytorch/pytorch/issues/149422 +======= + + torch_pkg_size="$(ls -1 /final_pkgs/torch-* | sort |tail -1 |xargs wc -c |cut -d ' ' -f1)" + # todo: implement check for large binaries + # if the package is larger than 1.5GB, we disable the pypi check. + # this package contains all libraries packaged in torch libs folder + # example of such package is https://download.pytorch.org/whl/cu126_full/torch + if [[ "\$torch_pkg_size" -gt 1500000000 ]]; then + python /pytorch/.ci/pytorch/smoke_test/smoke_test.py --package=torchonly --torch-compile-check disabled --pypi-pkg-check disabled + else + python /pytorch/.ci/pytorch/smoke_test/smoke_test.py --package=torchonly --torch-compile-check disabled $extra_parameters + fi + + if [[ "\$GPU_ARCH_TYPE" != *cpu-aarch64* ]]; then + # https://github.com/pytorch/pytorch/issues/149422 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python /pytorch/.ci/pytorch/smoke_test/check_gomp.py fi fi diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh index 67c69ba7e3ce..be2f3bc63353 100755 --- a/.circleci/scripts/binary_populate_env.sh +++ b/.circleci/scripts/binary_populate_env.sh @@ -79,8 +79,13 @@ TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt) # Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'" +<<<<<<< HEAD # CUDA 12.8 builds have triton for Linux and Linux aarch64 binaries. if [[ "$DESIRED_CUDA" == cu128 ]]; then +======= +# CUDA 12.9 builds have triton for Linux and Linux aarch64 binaries. 
+if [[ "$DESIRED_CUDA" == "cu129" ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TRITON_CONSTRAINT="platform_system == 'Linux'" fi @@ -109,6 +114,10 @@ fi # Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton xpu package if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then +<<<<<<< HEAD +======= + TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_xpu_version.txt) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}" if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then TRITON_SHORTHASH=$(cut -c1-8 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-xpu.txt) diff --git a/.circleci/scripts/binary_upload.sh b/.circleci/scripts/binary_upload.sh index 28140b832028..b97bf85fb51d 100755 --- a/.circleci/scripts/binary_upload.sh +++ b/.circleci/scripts/binary_upload.sh @@ -55,12 +55,23 @@ s3_upload() { s3_upload_dir="${s3_root_dir}/${UPLOAD_SUBFOLDER}/" fi ( +<<<<<<< HEAD +======= + cache_control_flag="" + if [[ "${UPLOAD_CHANNEL}" = "test" ]]; then + cache_control_flag="--cache-control='no-cache,no-store,must-revalidate'" + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for pkg in ${PKG_DIR}/*.${extension}; do ( set -x shm_id=$(sha256sum "${pkg}" | awk '{print $1}') ${AWS_S3_CP} --no-progress --acl public-read "${pkg}" "${s3_upload_dir}" \ +<<<<<<< HEAD --metadata "checksum-sha256=${shm_id}" +======= + --metadata "checksum-sha256=${shm_id}" ${cache_control_flag} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) done ) diff --git a/.circleci/scripts/binary_windows_build.sh b/.circleci/scripts/binary_windows_build.sh index 2d618ac53082..cc39e6cad175 100644 --- a/.circleci/scripts/binary_windows_build.sh +++ b/.circleci/scripts/binary_windows_build.sh @@ -4,16 +4,30 @@ set -eux -o pipefail source "${BINARY_ENV_FILE:-/c/w/env}" mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" +<<<<<<< HEAD export CUDA_VERSION="${DESIRED_CUDA/cu/}" export USE_SCCACHE=1 export SCCACHE_BUCKET=ossci-compiler-cache export SCCACHE_IGNORE_SERVER_IO_ERROR=1 export VC_YEAR=2019 +======= +if [[ "$OS" != "windows-arm64" ]]; then + export CUDA_VERSION="${DESIRED_CUDA/cu/}" + export USE_SCCACHE=1 + export SCCACHE_BUCKET=ossci-compiler-cache + export SCCACHE_IGNORE_SERVER_IO_ERROR=1 + export VC_YEAR=2022 +fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ "$DESIRED_CUDA" == 'xpu' ]]; then export VC_YEAR=2022 export USE_SCCACHE=0 +<<<<<<< HEAD export XPU_VERSION=2025.0 +======= + export XPU_VERSION=2025.1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) export XPU_ENABLE_KINETO=1 fi @@ -22,7 +36,20 @@ df -h pushd "$PYTORCH_ROOT/.ci/pytorch/" export NIGHTLIES_PYTORCH_ROOT="$PYTORCH_ROOT" +<<<<<<< HEAD ./windows/internal/build_wheels.bat +======= + +if [[ "$OS" == "windows-arm64" ]]; then + if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then + ./windows/arm64/build_libtorch.bat + elif [[ "$PACKAGE_TYPE" == 'wheel' ]]; then + ./windows/arm64/build_pytorch.bat + fi +else + ./windows/internal/build_wheels.bat +fi +>>>>>>> 
5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "Free space on filesystem after build:" df -h diff --git a/.circleci/scripts/binary_windows_test.sh b/.circleci/scripts/binary_windows_test.sh index 5e44ef0427c1..23521a6d31a1 100644 --- a/.circleci/scripts/binary_windows_test.sh +++ b/.circleci/scripts/binary_windows_test.sh @@ -4,6 +4,7 @@ set -eux -o pipefail source "${BINARY_ENV_FILE:-/c/w/env}" export CUDA_VERSION="${DESIRED_CUDA/cu/}" +<<<<<<< HEAD export VC_YEAR=2019 if [[ "$DESIRED_CUDA" == 'xpu' ]]; then @@ -13,5 +14,21 @@ fi pushd "$PYTORCH_ROOT/.ci/pytorch/" ./windows/internal/smoke_test.bat +======= +export VC_YEAR=2022 + +if [[ "$DESIRED_CUDA" == 'xpu' ]]; then + export VC_YEAR=2022 + export XPU_VERSION=2025.1 +fi + +pushd "$PYTORCH_ROOT/.ci/pytorch/" + +if [[ "$OS" == "windows-arm64" ]]; then + ./windows/arm64/smoke_test.bat +else + ./windows/internal/smoke_test.bat +fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) popd diff --git a/.clang-tidy b/.clang-tidy index a45142433ef7..187bcd046dc3 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -48,12 +48,18 @@ misc-*, -misc-no-recursion, -misc-non-private-member-variables-in-classes, -misc-unused-using-decls, +<<<<<<< HEAD -misc-use-internal-linkage, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) modernize-*, -modernize-macro-to-enum, -modernize-return-braced-init-list, -modernize-use-auto, +<<<<<<< HEAD -modernize-use-default-member-init, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) -modernize-use-using, -modernize-use-trailing-return-type, -modernize-use-nodiscard, diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index e151576219af..343acae90cfe 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -1,3 +1,4 @@ +<<<<<<< HEAD FROM mcr.microsoft.com/vscode/devcontainers/miniconda:0-3 # I am suprised this is needed @@ -12,10 +13,36 @@ RUN if [ -f "/tmp/conda-tmp/environment.yml" ]; then umask 0002 && /opt/conda/bi # Tools needed for llvm RUN sudo apt-get -y update RUN sudo apt install -y lsb-release wget software-properties-common gnupg +======= +FROM mcr.microsoft.com/vscode/devcontainers/base:ubuntu-22.04 + +# Tools needed for development +RUN apt-get -y update && \ + apt-get install -y \ + build-essential \ + cmake \ + ninja-build \ + git \ + python3 \ + python3-pip \ + python3-dev \ + python3-venv \ + libopenblas-dev + +# Tools needed for llvm +RUN apt-get install --no-install-recommends -y lsb-release wget software-properties-common gnupg && \ + sudo apt-get clean -y + +# Create Python virtual environment +# RUN python3 -m venv /opt/venv +# ENV PATH="/opt/venv/bin:$PATH" +RUN pip3 install --upgrade pip +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install CLANG if version is specified ARG CLANG_VERSION RUN if [ -n "$CLANG_VERSION" ]; then \ +<<<<<<< HEAD sudo wget https://apt.llvm.org/llvm.sh; \ chmod +x llvm.sh; \ sudo ./llvm.sh "${CLANG_VERSION}"; \ @@ -32,3 +59,29 @@ ARG CUDA_VERSION RUN if [ -n "$CUDA_VERSION" ]; then \ conda install -y cuda -c "nvidia/label/cuda-${CUDA_VERSION}"; \ fi +======= + wget https://apt.llvm.org/llvm.sh; \ + chmod +x llvm.sh; \ + ./llvm.sh 
"${CLANG_VERSION}"; \ + echo 'export CC=clang' >> ~/.bashrc; \ + echo 'export CXX=clang++' >> ~/.bashrc; \ + apt-get install --no-install-recommends -y clang libomp-dev && \ + apt-get clean -y; \ + fi + + +# Install CUDA if version is specified +ARG CUDA_VERSION +RUN if [ -n "$CUDA_VERSION" ]; then \ + CUDA_REPO_VERSION=$(echo ${CUDA_VERSION} | sed 's/\./\-/g'); \ + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb && \ + dpkg -i cuda-keyring_1.0-1_all.deb && \ + apt-get install --no-install-recommends -y cuda-toolkit-${CUDA_VERSION} && \ + apt-get clean -y; \ + fi + +# Set PATH for CUDA +ENV PATH="/usr/local/cuda/bin:${PATH}" +ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" +ENV PIP_BREAK_SYSTEM_PACKAGES=1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.devcontainer/cpu/devcontainer.json b/.devcontainer/cpu/devcontainer.json index aaca1e0e9066..c2483e731890 100644 --- a/.devcontainer/cpu/devcontainer.json +++ b/.devcontainer/cpu/devcontainer.json @@ -3,7 +3,11 @@ { "name": "PyTorch - CPU", "build": { +<<<<<<< HEAD "context": "../..", +======= + "context": "./", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "dockerfile": "../Dockerfile", "args": { "USERNAME": "vscode", @@ -11,6 +15,15 @@ "CLANG_VERSION": "" } }, +<<<<<<< HEAD +======= + // Mount the full repo only after the container starts + "workspaceMount": "source=${localWorkspaceFolder},target=/workspace/pytorch,type=bind,consistency=cached", + "workspaceFolder": "/workspace/pytorch", + "containerEnv": { + "PIP_USER": "0" // <‑‑ disable implicit --user + }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Features to add to the dev container. More info: https://containers.dev/features. "features": { diff --git a/.devcontainer/cuda/devcontainer.json b/.devcontainer/cuda/devcontainer.json index b0d448b8dc47..757e7c851de4 100644 --- a/.devcontainer/cuda/devcontainer.json +++ b/.devcontainer/cuda/devcontainer.json @@ -3,16 +3,34 @@ { "name": "PyTorch - CUDA", "build": { +<<<<<<< HEAD "context": "../..", +======= + "context": "./", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "dockerfile": "../Dockerfile", "args": { "USERNAME": "vscode", "BUILDKIT_INLINE_CACHE": "0", +<<<<<<< HEAD "CUDA_VERSION": "11.8.0", "CLANG_VERSION": "" } }, "runArgs": ["--gpus", "all"], +======= + "CUDA_VERSION": "12.8.0", + "CLANG_VERSION": "" + } + }, + "runArgs": ["--runtime", "nvidia", "--gpus", "all"], + // Mount the full repo only after the container starts + "workspaceMount": "source=${localWorkspaceFolder},target=/workspace/pytorch,type=bind,consistency=cached", + "workspaceFolder": "/workspace/pytorch", + "containerEnv": { + "PIP_USER": "0" // <‑‑ disable implicit --user + }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Use 'forwardPorts' to make a list of ports inside the container available locally. 
// "forwardPorts": [], diff --git a/.devcontainer/cuda/requirements.txt b/.devcontainer/cuda/requirements.txt new file mode 100644 index 000000000000..eb0a8b9c2b19 --- /dev/null +++ b/.devcontainer/cuda/requirements.txt @@ -0,0 +1,2 @@ +cmake +ninja \ No newline at end of file diff --git a/.devcontainer/scripts/install-dev-tools.sh b/.devcontainer/scripts/install-dev-tools.sh index f33f294645e7..b03b0e47ca57 100644 --- a/.devcontainer/scripts/install-dev-tools.sh +++ b/.devcontainer/scripts/install-dev-tools.sh @@ -8,6 +8,12 @@ git submodule update --init --recursive make setup-lint # Add CMAKE_PREFIX_PATH to bashrc +<<<<<<< HEAD echo 'export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}' >> ~/.bashrc # Add linker path so that cuda-related libraries can be found echo 'export LDFLAGS="-L${CONDA_PREFIX}/lib/ $LDFLAGS"' >> ~/.bashrc +======= +echo 'export CMAKE_PREFIX_PATH=/usr/local' >> ~/.bashrc +# Add linker path so that cuda-related libraries can be found +echo 'export LDFLAGS="-L/usr/local/cuda/lib64/ $LDFLAGS"' >> ~/.bashrc +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 000000000000..74c90164422d --- /dev/null +++ b/.editorconfig @@ -0,0 +1,14 @@ +root = true + +[*] +end_of_line = lf +insert_final_newline = true + +# Python +[*.py] +indent_style = space +indent_size = 4 + +# Make +[Makefile] +indent_style = tab diff --git a/.flake8 b/.flake8 index c30f95886924..89f8318c6e52 100644 --- a/.flake8 +++ b/.flake8 @@ -19,6 +19,11 @@ ignore = G100,G101,G200 # these ignores are from flake8-simplify. please fix or ignore with commented reason SIM105,SIM108,SIM110,SIM111,SIM113,SIM114,SIM115,SIM116,SIM117,SIM118,SIM119,SIM12, +<<<<<<< HEAD +======= + # SIM104 is already covered by pyupgrade ruff + SIM104, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # flake8-simplify code styles SIM102,SIM103,SIM106,SIM112, # TorchFix codes that don't make sense for PyTorch itself: diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index 458f283507fc..92e02d2ac274 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -12,7 +12,13 @@ body: description: | Please provide a clear and concise description of what the bug is. +<<<<<<< HEAD If relevant, add a minimal example so that we can reproduce the error by running the code. It is very important for the snippet to be as succinct (minimal) as possible, so please take time to trim down any irrelevant code to help us debug efficiently. We are going to copy-paste your code and we expect to get the same result as you did: avoid any external data, and include the relevant imports, etc. For example: +======= + If relevant, add a minimal example so that we can reproduce the error by running the code. It is very important for the snippet to be as succinct (minimal) as possible, so please take time to trim down any irrelevant code to help us debug efficiently. + Your example should be fully self-contained and not rely on any artifact that should be downloaded. 
+ For example: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ```python # All necessary imports at the beginning @@ -26,6 +32,10 @@ body: If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com. Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````. +<<<<<<< HEAD +======= + If your issue is related to numerical accuracy or reproducibility, please read the [numerical accuracy](https://docs.pytorch.org/docs/stable/notes/numerical_accuracy.html) and [reproducibility](https://docs.pytorch.org/docs/stable/notes/randomness.html) notes. If the difference is not expected as described in these documents, please provide appropriate justification on why one result is wrong and the other is correct. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) placeholder: | A clear and concise description of what the bug is. diff --git a/.github/ISSUE_TEMPLATE/disable-ci-jobs.md b/.github/ISSUE_TEMPLATE/disable-ci-jobs.md index b4b078badb34..8bdcc8a2c085 100644 --- a/.github/ISSUE_TEMPLATE/disable-ci-jobs.md +++ b/.github/ISSUE_TEMPLATE/disable-ci-jobs.md @@ -5,7 +5,11 @@ title: "DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME]" labels: "module: ci" --- +<<<<<<< HEAD > For example, DISABLED pull / win-vs2019-cpu-py3 / test (default). Once +======= +> For example, DISABLED pull / win-vs2022-cpu-py3 / test (default). Once +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) > created, the job will be disabled within 15 minutes. You can check the > list of disabled jobs at https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json diff --git a/.github/ISSUE_TEMPLATE/pt2-bug-report.yml b/.github/ISSUE_TEMPLATE/pt2-bug-report.yml index be22b1446b4e..e6ea0b4b82a9 100644 --- a/.github/ISSUE_TEMPLATE/pt2-bug-report.yml +++ b/.github/ISSUE_TEMPLATE/pt2-bug-report.yml @@ -20,7 +20,11 @@ body: - Don't compare indices of max/min etc, because that avoids the above requirement +<<<<<<< HEAD - If comparing eager and torch.compile at fp16/bf16, you should use fp32 as baseline +======= + - When comparing eager and torch.compile, use a higher precision result as a baseline. `torch._dynamo.utils.same` with fp64_ref will handle this comparison. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - Ensure rng state used to compare results is equivalent. 
Use `torch._inductor.config.fallback_random=True` and reset the torch rng seed between comparisons diff --git a/.github/ISSUE_TEMPLATE/release-feature-request.yml b/.github/ISSUE_TEMPLATE/release-feature-request.yml new file mode 100644 index 000000000000..80f10807ae56 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/release-feature-request.yml @@ -0,0 +1,111 @@ +name: 🚀 Release highlight for proposed Feature +description: Submit a Release highlight for proposed Feature +labels: ["release-feature-request"] + +body: +- type: textarea + attributes: + label: Release highlight for proposed Feature + description: > + Example: “A torch.special module, analogous to SciPy's special module.” +- type: input + id: contact + attributes: + label: Point(s) of contact + description: How can we get in touch with you if we need more info? + placeholder: ex. github username + validations: + required: false +- type: dropdown + attributes: + label: Release Mode (pytorch/pytorch features only) + description: | + If "out-of-tree", please include the GH repo name + options: + - In-tree + - Out-of-tree + validations: + required: true +- type: textarea + attributes: + label: Out-Of-Tree Repo + description: > + please include the GH repo name + validations: + required: false +- type: textarea + attributes: + label: Description and value to the user + description: > + Please provide a brief description of the feature and how it will benefit the user. + validations: + required: false +- type: textarea + attributes: + label: Link to design doc, GitHub issues, past submissions, etc + validations: + required: false +- type: textarea + attributes: + label: What feedback adopters have provided + description: > + Please list users/teams that have tried the feature and provided feedback. If that feedback motivated material changes (API, doc, etc..), a quick overview of the changes and the status (planned, in progress, implemented) would be helpful as well. + validations: + required: false +- type: dropdown + attributes: + label: Plan for documentations / tutorials + description: | + Select One of the following options + options: + - Tutorial exists + - Will submit a PR to pytorch/tutorials + - Will submit a PR to a repo + - Tutorial is not needed + validations: + required: true +- type: textarea + attributes: + label: Additional context for tutorials + description: > + Please provide a link for existing tutorial or link to a repo or context for why tutorial is not needed. + validations: + required: false +- type: dropdown + attributes: + label: Marketing/Blog Coverage + description: | + Are you requesting feature Inclusion in the release blogs? + options: + - "Yes" + - "No" + validations: + required: true +- type: textarea + attributes: + label: Are you requesting other marketing assistance with this feature? + description: > + E.g. supplementary blogs, social media amplification, etc. + validations: + required: false +- type: textarea + attributes: + label: Release Version + description: > + Please include release version for marketing coverage. + validations: + required: false +- type: textarea + attributes: + label: OS / Platform / Compute Coverage + description: > + Please list the platforms supported by the proposed feature. If the feature supports all the platforms, write "all". Goal of this section is to clearly share if this feature works in all PyTorch configurations or is it limited to only certain platforms/configurations (e.g. CPU only, GPU only, Linux only, etc...) 
+ validations: + required: false +- type: textarea + attributes: + label: Testing Support (CI, test cases, etc..) + description: > + Please provide an overview of test coverage. This includes unit testing and integration testing, but if E2E validation testing has been done to show that the feature works for a certain set of use cases or models please mention that as well. + validations: + required: false diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index 35e1323ab8b8..370a9b7ddd11 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -14,6 +14,10 @@ self-hosted-runner: - linux.12xlarge - linux.24xlarge - linux.24xlarge.ephemeral +<<<<<<< HEAD +======= + - linux.24xlarge.amd +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - linux.arm64.2xlarge - linux.arm64.2xlarge.ephemeral - linux.arm64.m7g.4xlarge @@ -45,10 +49,24 @@ self-hosted-runner: - windows.g5.4xlarge.nvidia.gpu # Windows ARM64 runners - windows-11-arm64 +<<<<<<< HEAD # Organization-wide AMD hosted runners - linux.rocm.gpu - linux.rocm.gpu.2 - linux.rocm.gpu.4 +======= + - windows-11-arm64-preview + # Organization-wide AMD-hosted runners + # MI2xx runners + - linux.rocm.gpu + - linux.rocm.gpu.mi250 + - linux.rocm.gpu.2 + - linux.rocm.gpu.4 + # MI300 runners + - linux.rocm.gpu.mi300.2 + - linux.rocm.gpu.mi300.4 + - rocm-docker +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Repo-specific Apple hosted runners - macos-m1-ultra - macos-m2-14 diff --git a/.github/actions/binary-docker-build/action.yml b/.github/actions/binary-docker-build/action.yml new file mode 100644 index 000000000000..bc6e2b2196d1 --- /dev/null +++ b/.github/actions/binary-docker-build/action.yml @@ -0,0 +1,70 @@ +name: Binary docker build + +description: Build docker image for binary builds + +inputs: + docker-image-name: + description: Docker image name for PR builds + required: true + docker-build-dir: + description: Location of the build.sh relative to .ci/docker + required: true + custom-tag-prefix: + description: Custom tag prefix for the docker image + required: false + DOCKER_TOKEN: + description: Docker token for authentication + required: true + DOCKER_ID: + description: Docker ID for authentication + required: true + +runs: + using: composite + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: ${{ inputs.docker-image-name }} + docker-build-dir: .ci/docker + custom-tag-prefix: ${{ inputs.custom-tag-prefix }} + docker-build-script: ${{ inputs.docker-build-dir }}/build.sh + always-rebuild: true + push: true + + - name: Tag and (if WITH_PUSH) push docker image to docker.io + env: + DOCKER_TOKEN: ${{ inputs.DOCKER_TOKEN }} + DOCKER_ID: ${{ inputs.DOCKER_ID }} + DOCKER_IMAGE_NAME: ${{ inputs.docker-image-name }} + DOCKER_IMAGE_PREFIX: ${{ inputs.custom-tag-prefix }} + CREATED_FULL_DOCKER_IMAGE_NAME: ${{ steps.calculate-docker-image.outputs.docker-image }} + shell: bash + run: | + set -euox pipefail + GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)} + GIT_BRANCH_NAME=${GITHUB_REF##*/} + GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)} + CI_FOLDER_SHA=$(git rev-parse HEAD:.ci/docker) + + 
DOCKER_IMAGE_NAME_PREFIX=docker.io/pytorch/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_PREFIX} + + docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX} + docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_BRANCH_NAME} + docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_COMMIT_SHA} + docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX}-${CI_FOLDER_SHA} + + # Pretty sure Github will mask tokens and I'm not sure if it will even be + # printed due to pipe, but just in case + set +x + if [[ ${WITH_PUSH:-false} == "true" ]]; then + echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin + docker push ${DOCKER_IMAGE_NAME_PREFIX} + docker push ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_BRANCH_NAME} + docker push ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_COMMIT_SHA} + docker push ${DOCKER_IMAGE_NAME_PREFIX}-${CI_FOLDER_SHA} + fi diff --git a/.github/actions/build-android/action.yml b/.github/actions/build-android/action.yml index 1d4d71fd9d36..f66f4988e29a 100644 --- a/.github/actions/build-android/action.yml +++ b/.github/actions/build-android/action.yml @@ -9,7 +9,11 @@ inputs: arch-for-build-env: description: | arch to pass to build environment. +<<<<<<< HEAD This is currently different than the arch name we use elswhere, which +======= + This is currently different than the arch name we use elsewhere, which +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) should be fixed. required: true github-secret: diff --git a/.github/actions/checkout-pytorch/action.yml b/.github/actions/checkout-pytorch/action.yml index 7908e9a12c02..4e2fdb634173 100644 --- a/.github/actions/checkout-pytorch/action.yml +++ b/.github/actions/checkout-pytorch/action.yml @@ -23,12 +23,54 @@ runs: id: check_container_runner run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT" +<<<<<<< HEAD - name: Clean workspace +======= + - name: Set up parallel fetch and clean workspace + id: first-clean + continue-on-error: true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) shell: bash if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }} env: NO_SUDO: ${{ inputs.no-sudo }} run: | +<<<<<<< HEAD +======= + # Use all available CPUs for fetching + cd "${GITHUB_WORKSPACE}" + git config --global fetch.parallel 0 + git config --global submodule.fetchJobs 0 + + # Clean workspace. 
The default checkout action should also do this, but + # do it here as well just in case + if [[ -d .git ]]; then + if [ -z "${NO_SUDO}" ]; then + sudo git clean -ffdx + else + git clean -ffdx + fi + fi + + - name: Checkout PyTorch + id: first-checkout-attempt + continue-on-error: true + uses: actions/checkout@v4 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # --depth=1 for speed, manually fetch history and other refs as necessary + fetch-depth: ${{ inputs.fetch-depth }} + submodules: ${{ inputs.submodules }} + show-progress: false + + - name: Clean workspace (try again) + if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && + (steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success') }} + shell: bash + env: + NO_SUDO: ${{ inputs.no-sudo }} + run: | +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) retry () { $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) } @@ -40,6 +82,7 @@ runs: fi mkdir "${GITHUB_WORKSPACE}" +<<<<<<< HEAD # Use all available CPUs for fetching cd "${GITHUB_WORKSPACE}" git config --global fetch.parallel 0 @@ -50,6 +93,13 @@ runs: with: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # --depth=1 for speed, manually fetch history and other refs as necessary +======= + - name: Checkout PyTorch (try again) + uses: actions/checkout@v4 + if: ${{ steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success' }} + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fetch-depth: ${{ inputs.fetch-depth }} submodules: ${{ inputs.submodules }} show-progress: false diff --git a/.github/actions/filter-test-configs/action.yml b/.github/actions/filter-test-configs/action.yml index 7da1ce3fe071..493bb0f15b56 100644 --- a/.github/actions/filter-test-configs/action.yml +++ b/.github/actions/filter-test-configs/action.yml @@ -157,4 +157,8 @@ runs: echo "Is keep-going label set? ${{ steps.filter.outputs.keep-going }}" echo +<<<<<<< HEAD echo "Renabled issues? ${{ steps.filter.outputs.reenabled-issues }}" +======= + echo "Reenabled issues? 
${{ steps.filter.outputs.reenabled-issues }}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/actions/linux-test/action.yml b/.github/actions/linux-test/action.yml index 0b031046a7a5..7edcba38c776 100644 --- a/.github/actions/linux-test/action.yml +++ b/.github/actions/linux-test/action.yml @@ -66,7 +66,11 @@ runs: - name: configure aws credentials if : ${{ inputs.aws-role-to-assume != '' }} +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v3 +======= + uses: aws-actions/configure-aws-credentials@v4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: ${{ inputs.aws-role-to-assume }} role-session-name: gha-linux-test @@ -153,7 +157,11 @@ runs: github-token: ${{ inputs.GITHUB_TOKEN }} - name: Check for keep-going label and re-enabled test issues +<<<<<<< HEAD # This uses the filter-test-configs action because it conviniently +======= + # This uses the filter-test-configs action because it conveniently +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # checks for labels and re-enabled test issues. It does not actually do # any filtering. All filtering is done in the build step. id: keep-going diff --git a/.github/actions/reuse-old-whl/action.yml b/.github/actions/reuse-old-whl/action.yml new file mode 100644 index 000000000000..1976a30828ed --- /dev/null +++ b/.github/actions/reuse-old-whl/action.yml @@ -0,0 +1,47 @@ +name: Reuse old wheel if possible + +description: + Reuse old wheel if possible + +inputs: + build-environment: + description: Build environment + required: true + run-id: + description: Workflow run ID + required: true + github-token: + description: GitHub token + required: true + job-id: + description: Job ID + required: true + job-name: + description: Job name + required: true + +outputs: + reuse: + description: Whether the wheel is reused or not + value: ${{ steps.check-file-changes.outputs.reuse }} + +runs: + using: composite + + steps: + # Check out pytorch with fetch depth 0 + - name: Check file changes + id: check-file-changes + shell: bash + continue-on-error: true + env: + GITHUB_TOKEN: ${{ inputs.github-token }} + JOB_ID: ${{ inputs.job-id }} + JOB_NAME: ${{ inputs.job-name }} + run: | + set -x + python3 -m pip install boto3==1.35.42 + python3 ${GITHUB_ACTION_PATH}/reuse_old_whl.py \ + --build-environment "${{ inputs.build-environment }}" \ + --run-id "${{ inputs.run-id }}" \ + --github-ref "${{ github.ref }}" diff --git a/.github/actions/reuse-old-whl/reuse_old_whl.py b/.github/actions/reuse-old-whl/reuse_old_whl.py new file mode 100644 index 000000000000..c4756f4a2f4c --- /dev/null +++ b/.github/actions/reuse-old-whl/reuse_old_whl.py @@ -0,0 +1,385 @@ +import argparse +import os +import subprocess +import sys +from functools import lru_cache +from pathlib import Path +from typing import Any, cast, Optional, Union + +import requests + + +REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent +sys.path.insert(0, str(REPO_ROOT)) +from tools.stats.upload_metrics import emit_metric + + +sys.path.remove(str(REPO_ROOT)) # Clean up sys.path after import + + +FORCE_REBUILD_LABEL = "ci-force-rebuild" + + +@lru_cache +def get_merge_base() -> str: + merge_base = subprocess.check_output( + ["git", "merge-base", "HEAD", "origin/main"], + text=True, + stderr=subprocess.DEVNULL, + ).strip() + 
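    # The merge base against origin/main is the commit whose artifacts we try to
    # reuse: find_old_whl() looks up a matching main-branch build on S3, and
    # check_changed_files() verifies that only allow-listed files differ between
    # it and HEAD. For example (hypothetical paths), "torch/_dynamo/utils.py" or
    # "test/test_nn.py" would still allow reuse, while "torch/csrc/Module.cpp" or
    # "setup.py" would force a rebuild.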
# Remove this when we turn this off for the main branch + if merge_base == get_head_sha(): + print("Merge base is the same as HEAD, using HEAD^") + merge_base = subprocess.check_output( + ["git", "rev-parse", "HEAD^"], + text=True, + stderr=subprocess.DEVNULL, + ).strip() + print(f"Merge base: {merge_base}") + return merge_base + + +@lru_cache +def get_head_sha() -> str: + sha = subprocess.check_output( + ["git", "rev-parse", "HEAD"], + text=True, + stderr=subprocess.DEVNULL, + ).strip() + return sha + + +def is_main_branch() -> bool: + return False + # Testing on main branch for now + # print( + # f"Checking if we are on main branch: merge base {get_merge_base()}, head {get_head_sha()}" + # ) + # return get_merge_base() == get_head_sha() + + +def query_github_api(url: str) -> Any: + headers = { + "Accept": "application/vnd.github.v3+json", + "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}", + } + response = requests.get(url, headers=headers) + return response.json() + + +@lru_cache +def check_labels_for_pr() -> bool: + # Check if the current commit is part of a PR and if it has the + # FORCE_REBUILD_LABEL + head_sha = get_head_sha() + url = f"https://api.github.com/repos/pytorch/pytorch/commits/{head_sha}/pulls" + response = query_github_api(url) + + print( + f"Found {len(response)} PRs for commit {head_sha}: {[pr['number'] for pr in response]}" + ) + for pr in response: + labels = pr.get("labels", []) + for label in labels: + if label["name"] == FORCE_REBUILD_LABEL: + print(f"Found label {FORCE_REBUILD_LABEL} in PR {pr['number']}.") + return True + return False + + +def check_issue_open() -> bool: + # Check if issue #153759 is open. This is the config issue for quickly + # forcing everyone to build + url = "https://api.github.com/repos/pytorch/pytorch/issues/153759" + response = query_github_api(url) + if response.get("state") == "open": + print("Issue #153759 is open.") + return True + else: + print("Issue #153759 is not open.") + return False + + +def get_workflow_id(run_id: str) -> Optional[str]: + # Get the workflow ID that corresponds to the file for the run ID + url = f"https://api.github.com/repos/pytorch/pytorch/actions/runs/{run_id}" + response = query_github_api(url) + if "workflow_id" in response: + print(f"Found workflow ID for run ID {run_id}: {response['workflow_id']}") + return cast(str, response["workflow_id"]) + else: + print("No workflow ID found.") + return None + + +def ok_changed_file(file: str) -> bool: + # Return true if the file is in the list of allowed files to be changed to + # reuse the old whl + if ( + file.startswith("torch/") + and file.endswith(".py") + and not file.startswith("torch/csrc/") + ): + return True + if file.startswith("test/") and file.endswith(".py"): + return True + if file.startswith("docs/") and file.endswith((".md", ".rst")): + return True + return False + + +def check_changed_files(sha: str) -> bool: + # Return true if all the changed files are in the list of allowed files to + # be changed to reuse the old whl + + # Removing files in the torch folder is not allowed since rsync will not + # remove files + removed_files = ( + subprocess.check_output( + [ + "git", + "diff", + "--name-only", + sha, + "HEAD", + "--diff-filter=D", + "--no-renames", + ], + text=True, + stderr=subprocess.DEVNULL, + ) + .strip() + .split() + ) + if any(file.startswith("torch/") for file in removed_files): + print( + f"Removed files between {sha} and HEAD: {removed_files}, cannot reuse old whl" + ) + return False + + changed_files = ( + 
subprocess.check_output( + ["git", "diff", "--name-only", sha, "HEAD", "--no-renames"], + text=True, + stderr=subprocess.DEVNULL, + ) + .strip() + .split() + ) + print(f"Checking changed files between {sha} and HEAD:") + for file in changed_files: + if not ok_changed_file(file): + print(f" File {file} is not allowed to be changed.") + return False + else: + print(f" File {file} is allowed to be changed.") + return True + + +def find_old_whl(workflow_id: str, build_environment: str, sha: str) -> bool: + # Find the old whl on s3 and download it to artifacts.zip + if build_environment is None: + print("BUILD_ENVIRONMENT is not set.") + return False + print(f"SHA: {sha}, workflow_id: {workflow_id}") + + workflow_runs = query_github_api( + f"https://api.github.com/repos/pytorch/pytorch/actions/workflows/{workflow_id}/runs?head_sha={sha}&branch=main&per_page=100" + ) + if workflow_runs.get("total_count", 0) == 0: + print("No workflow runs found.") + return False + for run in workflow_runs.get("workflow_runs", []): + # Look in s3 for the old whl + run_id = run["id"] + try: + url = f"https://gha-artifacts.s3.amazonaws.com/pytorch/pytorch/{run_id}/{build_environment}/artifacts.zip" + print(f"Checking for old whl at {url}") + response = requests.get( + url, + ) + if response.status_code == 200: + with open("artifacts.zip", "wb") as f: + f.write(response.content) + print(f"Found old whl file from s3: {url}") + return True + except requests.RequestException as e: + print(f"Error checking for old whl: {e}") + continue + return False + + +def unzip_artifact_and_replace_files() -> None: + # Unzip the artifact and replace files + subprocess.check_output( + ["unzip", "-o", "artifacts.zip", "-d", "artifacts"], + ) + os.remove("artifacts.zip") + + head_sha = get_head_sha() + + # Rename wheel into zip + wheel_path = Path("artifacts/dist").glob("*.whl") + for path in wheel_path: + # Should be of the form torch-2.0.0+git1234567-cp37-etc.whl + # Should usually be the merge base sha but for the ones that didn't do + # the replacement, it won't be. 
Can probably change it to just be merge + # base later + old_version = f"+git{path.stem.split('+')[1].split('-')[0][3:]}" + new_version = f"+git{head_sha[:7]}" + + def rename_to_new_version(file: Union[str, Path]) -> None: + # Rename file with old_version to new_version + subprocess.check_output( + ["mv", file, str(file).replace(old_version, new_version)] + ) + + def change_content_to_new_version(file: Union[str, Path]) -> None: + # Check if is a file + if os.path.isdir(file): + return + # Replace the old version in the file with the new version + with open(file) as f: + content = f.read() + content = content.replace(old_version, new_version) + with open(file, "w") as f: + f.write(content) + + zip_path = path.with_suffix(".zip") + os.rename(path, zip_path) + old_stem = zip_path.stem + # Unzip the wheel + subprocess.check_output( + ["unzip", "-o", zip_path, "-d", f"artifacts/dist/{old_stem}"], + ) + + # Remove the old wheel (which is now a zip file) + os.remove(zip_path) + + # Copy python files into the artifact + subprocess.check_output( + ["rsync", "-avz", "torch", f"artifacts/dist/{old_stem}"], + ) + + change_content_to_new_version(f"artifacts/dist/{old_stem}/torch/version.py") + + for file in Path(f"artifacts/dist/{old_stem}").glob( + "*.dist-info/**", + ): + change_content_to_new_version(file) + + rename_to_new_version(f"artifacts/dist/{old_stem}") + new_stem = old_stem.replace(old_version, new_version) + + for file in Path(f"artifacts/dist/{new_stem}").glob( + "*.dist-info", + ): + rename_to_new_version(file) + + # Zip the wheel back + subprocess.check_output( + ["zip", "-r", f"{new_stem}.zip", "."], + cwd=f"artifacts/dist/{new_stem}", + ) + + subprocess.check_output( + [ + "mv", + f"artifacts/dist/{new_stem}/{new_stem}.zip", + f"artifacts/dist/{new_stem}.whl", + ], + ) + + # Remove the extracted folder + subprocess.check_output( + ["rm", "-rf", f"artifacts/dist/{new_stem}"], + ) + + # Rezip the artifact + subprocess.check_output(["zip", "-r", "artifacts.zip", "."], cwd="artifacts") + subprocess.check_output( + ["mv", "artifacts/artifacts.zip", "."], + ) + return None + + +def set_output() -> None: + # Disable for now so we can monitor first + # pass + if os.getenv("GITHUB_OUTPUT"): + with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env: + print("reuse=true", file=env) + else: + print("::set-output name=reuse::true") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Check for old whl files.") + parser.add_argument("--run-id", type=str, required=True, help="Workflow ID") + parser.add_argument( + "--build-environment", type=str, required=True, help="Build environment" + ) + parser.add_argument( + "--github-ref", + type=str, + ) + return parser.parse_args() + + +def can_reuse_whl(args: argparse.Namespace) -> tuple[bool, str]: + if args.github_ref and any( + args.github_ref.startswith(x) + for x in [ + "refs/heads/release", + "refs/tags/v", + "refs/heads/nightly", + ] + ): + print("Release branch, rebuild whl") + return (False, "Release branch") + + if not check_changed_files(get_merge_base()): + print("Cannot use old whl due to the changed files, rebuild whl") + return (False, "Changed files not allowed") + + if check_labels_for_pr(): + print(f"Found {FORCE_REBUILD_LABEL} label on PR, rebuild whl") + return (False, "Found FORCE_REBUILD_LABEL on PR") + + if check_issue_open(): + print("Issue #153759 is open, rebuild whl") + return (False, "Issue #153759 is open") + + workflow_id = get_workflow_id(args.run_id) + if workflow_id is None: + 
print("No workflow ID found, rebuild whl") + return (False, "No workflow ID found") + + if not find_old_whl(workflow_id, args.build_environment, get_merge_base()): + print("No old whl found, rebuild whl") + return (False, "No old whl found") + # TODO: go backwards from merge base to find more runs + + return (True, "Found old whl") + + +if __name__ == "__main__": + args = parse_args() + + reuse_whl, reason = can_reuse_whl(args) + + if reuse_whl: + print("Reusing old whl") + unzip_artifact_and_replace_files() + set_output() + + emit_metric( + "reuse_old_whl", + { + "reuse_whl": reuse_whl, + "reason": reason, + "build_environment": args.build_environment, + "merge_base": get_merge_base(), + "head_sha": get_head_sha(), + }, + ) diff --git a/.github/actions/setup-linux/action.yml b/.github/actions/setup-linux/action.yml index da514c04a69f..ee70cb47dc97 100644 --- a/.github/actions/setup-linux/action.yml +++ b/.github/actions/setup-linux/action.yml @@ -33,14 +33,22 @@ runs: id: check_container_runner run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT" +<<<<<<< HEAD - name: Start docker if docker deamon is not running +======= + - name: Start docker if docker daemon is not running +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) shell: bash if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }} run: | if systemctl is-active --quiet docker; then echo "Docker daemon is running..."; else +<<<<<<< HEAD echo "Starting docker deamon..." && sudo systemctl start docker; +======= + echo "Starting docker daemon..." && sudo systemctl start docker; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi - name: Log in to ECR diff --git a/.github/actions/setup-rocm/action.yml b/.github/actions/setup-rocm/action.yml index 0982df529dd4..1d5772bc7e70 100644 --- a/.github/actions/setup-rocm/action.yml +++ b/.github/actions/setup-rocm/action.yml @@ -5,6 +5,15 @@ description: Set up ROCm host for CI runs: using: composite steps: +<<<<<<< HEAD +======= + - name: Runner ROCm version + if: always() + shell: bash + run: | + dpkg -l | grep -E " rocm" + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Stop all running docker containers if: always() shell: bash diff --git a/.github/actions/setup-xpu/action.yml b/.github/actions/setup-xpu/action.yml index 50411e4bdf33..139f82b09fc1 100644 --- a/.github/actions/setup-xpu/action.yml +++ b/.github/actions/setup-xpu/action.yml @@ -29,13 +29,21 @@ runs: if: always() shell: bash run: | +<<<<<<< HEAD xpu-smi discovery +======= + timeout 30 xpu-smi discovery || true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Runner health check GPU count if: always() shell: bash run: | +<<<<<<< HEAD ngpu=$(xpu-smi discovery | grep -c -E 'Device Name') +======= + ngpu=$(timeout 30 xpu-smi discovery | grep -c -E 'Device Name' || true) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) msg="Please file an issue on pytorch/pytorch reporting the faulty runner. 
Include a link to the runner logs so the runner can be identified" if [[ $ngpu -eq 0 ]]; then echo "Error: Failed to detect any GPUs on the runner" diff --git a/.github/actions/test-pytorch-binary/action.yml b/.github/actions/test-pytorch-binary/action.yml index 51fc8d14f474..03b9e69bf72d 100644 --- a/.github/actions/test-pytorch-binary/action.yml +++ b/.github/actions/test-pytorch-binary/action.yml @@ -15,7 +15,10 @@ runs: -e BINARY_ENV_FILE \ -e BUILD_ENVIRONMENT \ -e DESIRED_CUDA \ +<<<<<<< HEAD -e DESIRED_DEVTOOLSET \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) -e DESIRED_PYTHON \ -e GITHUB_ACTIONS \ -e GPU_ARCH_TYPE \ diff --git a/.github/actions/upload-sccache-stats/action.yml b/.github/actions/upload-sccache-stats/action.yml index 1561a72ee786..8e30d056d5db 100644 --- a/.github/actions/upload-sccache-stats/action.yml +++ b/.github/actions/upload-sccache-stats/action.yml @@ -22,6 +22,7 @@ runs: retention-days: 14 if-no-files-found: warn path: sccache-stats-*.json +<<<<<<< HEAD - name: Format sccache stats shell: bash @@ -37,3 +38,5 @@ runs: dry-run: false schema-version: v3 github-token: ${{ inputs.github-token }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/actions/upload-test-artifacts/action.yml b/.github/actions/upload-test-artifacts/action.yml index 76b0e5533ce6..6cc23ff7f98c 100644 --- a/.github/actions/upload-test-artifacts/action.yml +++ b/.github/actions/upload-test-artifacts/action.yml @@ -48,6 +48,7 @@ runs: run: | # Remove any previous usage logs if they exist rm -f logs-*.zip +<<<<<<< HEAD # this workflow is also run in bazel build test, but we dont generate usage reports for it # so check to see if the file exists first if [ -f 'usage_log.txt' ]; then @@ -56,6 +57,10 @@ runs: if find "test/test-reports" -name "*.log" 2>/dev/null | grep -q .; then zip -r "logs-${FILE_SUFFIX}.zip" test/test-reports -i '*.log' fi +======= + zip "logs-${FILE_SUFFIX}.zip" 'usage_log.txt' || true + zip -r "logs-${FILE_SUFFIX}.zip" test/test-reports -i '*.log' || true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Zip debugging artifacts for upload if: runner.os != 'Windows' && !inputs.use-gha diff --git a/.github/actions/upload-utilization-stats/action.yml b/.github/actions/upload-utilization-stats/action.yml index 662a95330bb2..40fbb97b6a12 100644 --- a/.github/actions/upload-utilization-stats/action.yml +++ b/.github/actions/upload-utilization-stats/action.yml @@ -1,6 +1,10 @@ name: upload-utilization-stats +<<<<<<< HEAD description: Upload utilization stats to artifacts +======= +description: Upload utilization stats to artifacts. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inputs: workflow_run_id: @@ -23,6 +27,20 @@ inputs: type: string description: 'the job name of the test' required: True +<<<<<<< HEAD +======= + local_path: + type: string + description: 'the local path to the utilization stats file' + required: False + default: '' + artifact_prefix: + type: string + description: | + 'the prefix of the raw utilization data, for data stored in zip file, this is the prefix of the parent zip file' + default: "" + required: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runs: using: composite @@ -35,6 +53,11 @@ runs: echo "workflow_Name: ${{inputs.workflow_name}}" echo "job_id: ${{inputs.job_id}}" echo "job_name: ${{inputs.job_name}}" +<<<<<<< HEAD +======= + echo "artifact_prefix: ${{inputs.artifact_prefix}}" + python3 --version +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - uses: nick-fields/retry@v3.0.0 name: Setup dependencies with: @@ -44,7 +67,11 @@ runs: retry_wait_seconds: 30 command: | set -eu +<<<<<<< HEAD python3 -m pip install python-dateutil==2.8.2 boto3==1.35.42 pandas==2.1.3 +======= + python3 -m pip install python-dateutil==2.8.2 boto3==1.35.42 pandas==2.1.3 dataclasses_json==0.6.7 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Upload utilizatoin stats to s3 shell: bash run: | @@ -53,4 +80,10 @@ runs: --workflow-name "${{inputs.workflow_name}}" \ --workflow-run-attempt "${{inputs.workflow_attempt}}" \ --job-id "${{inputs.job_id}}" \ +<<<<<<< HEAD --job-name "${{inputs.job_name}}" +======= + --job-name "${{inputs.job_name}}" \ + --local-path "${{inputs.local_path}}" \ + --artifact-prefix "${{inputs.artifact_prefix}}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/ci_commit_pins/audio.txt b/.github/ci_commit_pins/audio.txt index f0b99d5801e4..9b47f5a0cf38 100644 --- a/.github/ci_commit_pins/audio.txt +++ b/.github/ci_commit_pins/audio.txt @@ -1 +1,5 @@ +<<<<<<< HEAD c670ad81fda266b6598aeeef434583eb98197ae8 +======= +4e94321c54617dd738a05bfedfc28bc0fa635b5c +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/ci_commit_pins/torchbench.txt b/.github/ci_commit_pins/torchbench.txt index 7e5c1c641e94..39e56b06da60 100644 --- a/.github/ci_commit_pins/torchbench.txt +++ b/.github/ci_commit_pins/torchbench.txt @@ -1 +1,5 @@ +<<<<<<< HEAD 373ffb19dc470f4423a3176a4133f8f4b3cdb5bd +======= +e03a63be43e33596f7f0a43b0f530353785e4a59 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt index c642e5d08c80..a3ad9654d8be 100644 --- a/.github/ci_commit_pins/vision.txt +++ b/.github/ci_commit_pins/vision.txt @@ -1 +1,5 @@ +<<<<<<< HEAD d23a6e1664d20707c11781299611436e1f0c104f +======= +966da7e46f65d6d49df3e31214470a4fe5cc8e66 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt index 
110dab1a870d..52f75199b046 100644 --- a/.github/ci_commit_pins/xla.txt +++ b/.github/ci_commit_pins/xla.txt @@ -1 +1,5 @@ +<<<<<<< HEAD r2.7 +======= +r2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/label_to_label.yml b/.github/label_to_label.yml index 5d6544a2f50f..ddaaf1517f2e 100644 --- a/.github/label_to_label.yml +++ b/.github/label_to_label.yml @@ -42,7 +42,11 @@ - "module: aotinductor" - "module: cudagraphs" - "oncall: export" +<<<<<<< HEAD - "module: startup-tracing-compile" +======= + - "module: compile-time" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - "module: compiled autograd" - "module: flex attention" - "module: dynamic shapes" diff --git a/.github/labeler.yml b/.github/labeler.yml index 5bf481fd6f34..dc25850ea932 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -112,3 +112,24 @@ - torch/csrc/inductor/aoti_include/xpu.h - torch/csrc/inductor/cpp_wrapper/device_internal/xpu.h - torch/csrc/inductor/cpp_wrapper/xpu.h +<<<<<<< HEAD +======= + +"release notes: inductor (aoti)": +- torch/_C/_aoti.pyi +- torch/_dynamo/repro/aoti.py +- torch/_higher_order_ops/aoti_call_delegate.py +- torch/_inductor/codegen/aoti_runtime/** +- torch/_inductor/codegen/aoti_hipify_utils.py +- torch/_inductor/codegen/cpp_wrapper_cpu.py +- torch/_inductor/codegen/cpp_wrapper_gpu.py +- torch/_inductor/aoti_eager.py +- torch/csrc/inductor/aoti_runtime/** +- torch/csrc/inductor/aoti_torch/** +- torch/csrc/inductor/aoti_runner/** +- torch/csrc/inductor/aoti_eager/** +- torch/csrc/inductor/aoti_package/** +- torch/csrc/inductor/aoti_include/** +- torchgen/aoti/** +- torchgen/gen_aoti_c_shim.py +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/merge_rules.yaml b/.github/merge_rules.yaml index f4b0dc127aa7..74dc75ae1733 100644 --- a/.github/merge_rules.yaml +++ b/.github/merge_rules.yaml @@ -123,6 +123,11 @@ - torch/*docs.py approved_by: - svekars +<<<<<<< HEAD +======= + - sekyondaMeta + - AlannaBurke +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) mandatory_checks_name: - EasyCLA - Lint @@ -393,19 +398,34 @@ - torch/_inductor/mkldnn_lowerings.py - torch/_inductor/fx_passes/mkldnn_fusion.py - torch/_inductor/fx_passes/quantization.py +<<<<<<< HEAD - torch/_inductor/codegen/cpp_prefix.h +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - torch/_inductor/codegen/cpp.py - torch/_inductor/codegen/cpp_utils.py - torch/_inductor/codegen/cpp_micro_gemm.py - torch/_inductor/codegen/cpp_template_kernel.py - torch/_inductor/codegen/cpp_template.py +<<<<<<< HEAD - torch/_inductor/codegen/cpp_gemm_template.py +======= + - torch/_inductor/codegen/cpp_bmm_template.py + - torch/_inductor/codegen/cpp_gemm_template.py + - torch/_inductor/codegen/cpp_grouped_gemm_template.py + - torch/_inductor/codegen/cpp_flex_attention_template.py + - torch/csrc/inductor/cpp_prefix.h +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - test/inductor/test_mkldnn_pattern_matcher.py - test/inductor/test_cpu_repro.py - test/inductor/test_cpu_cpp_wrapper.py - test/inductor/test_cpu_select_algorithm.py - 
aten/src/ATen/cpu/** - aten/src/ATen/native/quantized/cpu/** +<<<<<<< HEAD +======= + - aten/src/ATen/test/vec_test_all_types.* +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - test/quantization/core/test_quantized_op.py - torch/ao/quantization/quantizer/x86_inductor_quantizer.py - test/quantization/pt2e/test_x86inductor_quantizer.py @@ -413,6 +433,10 @@ - leslie-fang-intel - jgong5 - EikanWang +<<<<<<< HEAD +======= + - CaoE +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) mandatory_checks_name: - EasyCLA - Lint @@ -501,7 +525,13 @@ - name: XPU patterns: - '**xpu**' +<<<<<<< HEAD + - '**sycl**' +======= + - '**XPU**' - '**sycl**' + - '**SYCL**' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) approved_by: - EikanWang - jgong5 @@ -538,6 +568,10 @@ - bdhirsh - zou3519 - isuruf +<<<<<<< HEAD +======= + - Chillee +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) mandatory_checks_name: - EasyCLA - Lint diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index ccb71e6a9bf0..e22b47feb618 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -11,11 +11,19 @@ ciflow_push_tags: - ciflow/inductor-perf-compare - ciflow/inductor-micro-benchmark - ciflow/inductor-micro-benchmark-cpu-x86 +<<<<<<< HEAD +======= +- ciflow/inductor-perf-test-nightly-x86-zen +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - ciflow/inductor-cu126 - ciflow/linux-aarch64 - ciflow/mps - ciflow/nightly - ciflow/periodic +<<<<<<< HEAD +======= +- ciflow/periodic-rocm-mi300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - ciflow/rocm - ciflow/rocm-mi300 - ciflow/s390 @@ -24,7 +32,14 @@ ciflow_push_tags: - ciflow/unstable - ciflow/xpu - ciflow/torchbench +<<<<<<< HEAD - ciflow/autoformat +======= +- ciflow/op-benchmark +- ciflow/pull +- ciflow/h100 +- ciflow/h100-distributed +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) retryable_workflows: - pull - trunk diff --git a/.github/requirements-gha-cache.txt b/.github/requirements-gha-cache.txt index caabd1edf200..b0b3c046454b 100644 --- a/.github/requirements-gha-cache.txt +++ b/.github/requirements-gha-cache.txt @@ -10,5 +10,9 @@ lintrunner==0.10.7 ninja==1.10.0.post1 nvidia-ml-py==11.525.84 pyyaml==6.0 +<<<<<<< HEAD requests==2.32.2 +======= +requests==2.32.4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) rich==10.9.0 diff --git a/.github/requirements/README.md b/.github/requirements/README.md index 102ac4d420f0..b3c9bde0b143 100644 --- a/.github/requirements/README.md +++ b/.github/requirements/README.md @@ -11,6 +11,7 @@ jobs, but it also allows them to be cached properly to improve CI reliability. The list of support files are as follows: +<<<<<<< HEAD * Conda: * conda-env-iOS. This is used by iOS build and test jobs to setup the @@ -22,5 +23,8 @@ The list of support files are as follows: * Pip: * pip-requirements-iOS.txt. 
This is used by iOS build and test jobs to setup the pip environment +======= +* Pip: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * pip-requirements-macOS.txt. This is used by MacOS build and test jobs to setup the pip environment diff --git a/.github/requirements/conda-env-macOS-ARM64 b/.github/requirements/conda-env-macOS-ARM64 index 24ba665883ff..df9701270655 100644 --- a/.github/requirements/conda-env-macOS-ARM64 +++ b/.github/requirements/conda-env-macOS-ARM64 @@ -1,3 +1,4 @@ +<<<<<<< HEAD numpy=1.22.3 pyyaml=6.0 setuptools=72.1.0 @@ -20,3 +21,10 @@ certifi # Cross-compiling arm64 from x86-64 picks up 1.40.0 while testing on arm64 # itself only has up to 1.39.0 from upstream conda. Both work though libuv>=1.39.0,<=1.40.0 +======= +# Not pinning certifi so that we can always get the latest certificates +certifi +pip=23.2.1 +pkg-config=0.29.2 +wheel=0.37.1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/requirements/pip-requirements-macOS.txt b/.github/requirements/pip-requirements-macOS.txt index 06e0428c883b..13d4ae1e8f3f 100644 --- a/.github/requirements/pip-requirements-macOS.txt +++ b/.github/requirements/pip-requirements-macOS.txt @@ -1,4 +1,5 @@ boto3==1.35.42 +<<<<<<< HEAD hypothesis==6.56.4 expecttest==0.3.0 fbscribelogger==0.1.7 @@ -31,3 +32,40 @@ optree==0.13.0 # which the stringify metadata is wrong when escaping double quote protobuf==3.20.2 parameterized==0.8.1 +======= +cmake==3.27.* +expecttest==0.3.0 +fbscribelogger==0.1.7 +filelock==3.6.0 +hypothesis==6.56.4 +librosa>=0.6.2 +mpmath==1.3.0 +networkx==2.8.7 +ninja==1.10.2.4 +numba==0.59.0 +numpy==1.26.4 +opt-einsum>=3.3 +optree==0.13.0 +packaging==23.1 +parameterized==0.8.1 +pillow==10.3.0 +protobuf==5.29.4 +psutil==5.9.1 +pygments==2.15.0 +pytest-cpp==2.3.0 +pytest-flakefinder==1.1.0 +pytest-rerunfailures==10.3 +pytest-subtests==0.13.1 +pytest-xdist==3.3.1 +pytest==7.3.2 +pyyaml==6.0.2 +scipy==1.12.0 +setuptools==72.1.0 +sympy==1.13.3 +tlparse==0.3.30 +tensorboard==2.13.0 +typing-extensions==4.12.2 +unittest-xml-reporting<=3.2.0,>=2.0.0 +xdoctest==1.1.0 +z3-solver==4.12.2.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/scripts/amd/patch_triton_wheel.sh b/.github/scripts/amd/patch_triton_wheel.sh index ac233bdc4318..cbf159d08d42 100755 --- a/.github/scripts/amd/patch_triton_wheel.sh +++ b/.github/scripts/amd/patch_triton_wheel.sh @@ -76,7 +76,11 @@ for pkg in /$WHEELHOUSE_DIR/*triton*.whl; do echo "Copied $filepath to $patchedpath" done +<<<<<<< HEAD # Go through all required shared objects and see if any of our other objects are dependants. If so, replace so.ver wth so +======= + # Go through all required shared objects and see if any of our other objects are dependants. 
If so, replace so.ver with so +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for ((i=0;i<${#deps[@]};++i)); do echo "replacing "${deps_soname[i]} ${patched[i]} replace_needed_sofiles $PREFIX/$ROCM_LIB ${deps_soname[i]} ${patched[i]} diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py index 9bf48ff011a2..303fdc451d83 100644 --- a/.github/scripts/build_triton_wheel.py +++ b/.github/scripts/build_triton_wheel.py @@ -22,8 +22,16 @@ def read_triton_pin(device: str = "cuda") -> str: return f.read().strip() +<<<<<<< HEAD def read_triton_version() -> str: with open(REPO_DIR / ".ci" / "docker" / "triton_version.txt") as f: +======= +def read_triton_version(device: str = "cuda") -> str: + triton_version_file = "triton_version.txt" + if device == "xpu": + triton_version_file = "triton_xpu_version.txt" + with open(REPO_DIR / ".ci" / "docker" / triton_version_file) as f: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return f.read().strip() @@ -95,6 +103,10 @@ def build_triton( with TemporaryDirectory() as tmpdir: triton_basedir = Path(tmpdir) / "triton" triton_pythondir = triton_basedir / "python" +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) triton_repo = "https://github.com/openai/triton" if device == "rocm": triton_repo = "https://github.com/ROCm/triton" @@ -126,7 +138,11 @@ def build_triton( patch_init_py( triton_pythondir / "triton" / "__init__.py", version=f"{version}", +<<<<<<< HEAD expected_version=None, +======= + expected_version=read_triton_version(device), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) if device == "rocm": @@ -137,11 +153,27 @@ def build_triton( ) print("ROCm libraries setup for triton installation...") +<<<<<<< HEAD check_call( [sys.executable, "setup.py", "bdist_wheel"], cwd=triton_pythondir, env=env ) whl_path = next(iter((triton_pythondir / "dist").glob("*.whl"))) +======= + # old triton versions have setup.py in the python/ dir, + # new versions have it in the root dir. 
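    # i.e. prefer the checkout root when it contains setup.py and fall back to
    # python/ otherwise, so the same bdist_wheel invocation works for both layouts.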
+ triton_setupdir = ( + triton_basedir + if (triton_basedir / "setup.py").exists() + else triton_pythondir + ) + + check_call( + [sys.executable, "setup.py", "bdist_wheel"], cwd=triton_setupdir, env=env + ) + + whl_path = next(iter((triton_setupdir / "dist").glob("*.whl"))) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) shutil.copy(whl_path, Path.cwd()) if device == "rocm": @@ -164,15 +196,29 @@ def main() -> None: parser.add_argument("--py-version", type=str) parser.add_argument("--commit-hash", type=str) parser.add_argument("--with-clang-ldd", action="store_true") +<<<<<<< HEAD parser.add_argument("--triton-version", type=str, default=read_triton_version()) args = parser.parse_args() +======= + parser.add_argument("--triton-version", type=str, default=None) + args = parser.parse_args() + + triton_version = read_triton_version(args.device) + if args.triton_version: + triton_version = args.triton_version + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build_triton( device=args.device, commit_hash=( args.commit_hash if args.commit_hash else read_triton_pin(args.device) ), +<<<<<<< HEAD version=args.triton_version, +======= + version=triton_version, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) py_version=args.py_version, release=args.release, with_clang_ldd=args.with_clang_ldd, diff --git a/.github/scripts/docathon-label-sync.py b/.github/scripts/docathon-label-sync.py index a10c3c3f886c..15640a856611 100644 --- a/.github/scripts/docathon-label-sync.py +++ b/.github/scripts/docathon-label-sync.py @@ -28,12 +28,20 @@ def main() -> None: issue = repo.get_issue(issue_number) issue_labels = issue.labels docathon_label_present = any( +<<<<<<< HEAD label.name == "docathon-h1-2024" for label in issue_labels +======= + label.name == "docathon-h1-2025" for label in issue_labels +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) # if the issue has a docathon label, add all labels from the issue to the PR. 
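    # (when the docathon label is absent, the script logs that and returns without
    # touching the PR's labels)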
if not docathon_label_present: +<<<<<<< HEAD print("The 'docathon-h1-2024' label is not present in the issue.") +======= + print("The 'docathon-h1-2025' label is not present in the issue.") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return pull_request_labels = pull_request.get_labels() pull_request_label_names = [label.name for label in pull_request_labels] diff --git a/.github/scripts/filter_test_configs.py b/.github/scripts/filter_test_configs.py index a65e427e8c22..46786f79117f 100755 --- a/.github/scripts/filter_test_configs.py +++ b/.github/scripts/filter_test_configs.py @@ -1,4 +1,8 @@ #!/usr/bin/env python3 +<<<<<<< HEAD +======= +# ruff: noqa: LOG015 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) import json import logging @@ -39,9 +43,15 @@ def is_cuda_or_rocm_job(job_name: Optional[str]) -> bool: } # The link to the published list of disabled jobs +<<<<<<< HEAD DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json?versionId=n.FT07XR3dLMwOLBwmRNquyYSeGk8Het" # and unstable jobs UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json?versionId=.Ox7WAXa21I1PVqadHyPfhMRPhl0aCnD" +======= +DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json?versionId=HnkH0xQWnnsoeMsSIVf9291NE5c4jWSa" +# and unstable jobs +UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json?versionId=iP_F8gBs60PfOMAJ8gnn1paVrzM1WYsK" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Some constants used to handle disabled and unstable jobs JOB_NAME_SEP = "/" @@ -79,7 +89,11 @@ def parse_args() -> Any: parser.add_argument( "--job-name", type=str, +<<<<<<< HEAD help="the name of the current job, i.e. linux-focal-py3.8-gcc7 / build", +======= + help="the name of the current job, i.e. 
linux-jammy-py3.8-gcc7 / build", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) parser.add_argument("--pr-number", type=str, help="the pull request number") parser.add_argument("--tag", type=str, help="the associated tag if it exists") diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py index 67f86d878a88..608fd8666c49 100644 --- a/.github/scripts/generate_binary_build_matrix.py +++ b/.github/scripts/generate_binary_build_matrix.py @@ -15,6 +15,7 @@ from typing import Optional +<<<<<<< HEAD # NOTE: Also update the CUDA sources in tools/nightly.py when changing this list CUDA_ARCHES = ["11.8", "12.6", "12.8"] CUDA_STABLE = "12.6" @@ -36,10 +37,32 @@ CPU_CXX11_ABI_ARCH = ["cpu-cxx11-abi"] +======= +# NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this +CUDA_ARCHES = ["12.6", "12.8", "12.9"] +CUDA_STABLE = "12.8" +CUDA_ARCHES_FULL_VERSION = { + "12.6": "12.6.3", + "12.8": "12.8.1", + "12.9": "12.9.1", +} +CUDA_ARCHES_CUDNN_VERSION = { + "12.6": "9", + "12.8": "9", + "12.9": "9", +} + +# NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this +ROCM_ARCHES = ["6.3", "6.4"] + +XPU_ARCHES = ["xpu"] + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CPU_AARCH64_ARCH = ["cpu-aarch64"] CPU_S390X_ARCH = ["cpu-s390x"] +<<<<<<< HEAD CUDA_AARCH64_ARCHES = ["12.8-aarch64"] @@ -57,23 +80,39 @@ "nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'" ), +======= +CUDA_AARCH64_ARCHES = ["12.9-aarch64"] + + +PYTORCH_EXTRA_INSTALL_REQUIREMENTS = { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "12.6": ( "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | " +<<<<<<< HEAD "nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | " +======= + "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | " +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " +<<<<<<< HEAD "nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " +======= + "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | " +>>>>>>> 
5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'" ), "12.8": ( +<<<<<<< HEAD "nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | " @@ -101,6 +140,60 @@ "tcmlib==1.2.0 | " "umf==0.9.1 | " "intel-pti==0.10.1" +======= + "nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'" + ), + "12.9": ( + "nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 
'x86_64'" + ), + "xpu": ( + "intel-cmplr-lib-rt==2025.1.1 | " + "intel-cmplr-lib-ur==2025.1.1 | " + "intel-cmplr-lic-rt==2025.1.1 | " + "intel-sycl-rt==2025.1.1 | " + "oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "onemkl-sycl-blas==2025.1.0 | " + "onemkl-sycl-dft==2025.1.0 | " + "onemkl-sycl-lapack==2025.1.0 | " + "onemkl-sycl-rng==2025.1.0 | " + "onemkl-sycl-sparse==2025.1.0 | " + "dpcpp-cpp-rt==2025.1.1 | " + "intel-opencl-rt==2025.1.1 | " + "mkl==2025.1.0 | " + "intel-openmp==2025.1.1 | " + "tbb==2022.1.0 | " + "tcmlib==1.3.0 | " + "umf==0.10.0 | " + "intel-pti==0.12.3" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ), } @@ -146,8 +239,11 @@ def arch_type(arch_version: str) -> str: return "rocm" elif arch_version in XPU_ARCHES: return "xpu" +<<<<<<< HEAD elif arch_version in CPU_CXX11_ABI_ARCH: return "cpu-cxx11-abi" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif arch_version in CPU_AARCH64_ARCH: return "cpu-aarch64" elif arch_version in CPU_S390X_ARCH: @@ -158,6 +254,7 @@ def arch_type(arch_version: str) -> str: return "cpu" +<<<<<<< HEAD # This can be updated to the release version when cutting release branch, i.e. 2.1 DEFAULT_TAG = os.getenv("RELEASE_VERSION_TAG", "main") @@ -201,6 +298,30 @@ def arch_type(arch_version: str) -> str: for gpu_arch in ROCM_ARCHES }, ("cpu", CXX11_ABI): f"pytorch/libtorch-cxx11-builder:cpu-{DEFAULT_TAG}", +======= +DEFAULT_TAG = os.getenv("RELEASE_VERSION_TAG", "main") + +WHEEL_CONTAINER_IMAGES = { + **{gpu_arch: f"manylinux2_28-builder:cuda{gpu_arch}" for gpu_arch in CUDA_ARCHES}, + **{ + gpu_arch: f"manylinuxaarch64-builder:cuda{gpu_arch.replace('-aarch64', '')}" + for gpu_arch in CUDA_AARCH64_ARCHES + }, + **{gpu_arch: f"manylinux2_28-builder:rocm{gpu_arch}" for gpu_arch in ROCM_ARCHES}, + "xpu": "manylinux2_28-builder:xpu", + "cpu": "manylinux2_28-builder:cpu", + "cpu-aarch64": "manylinux2_28_aarch64-builder:cpu-aarch64", + "cpu-s390x": "pytorch/manylinuxs390x-builder:cpu-s390x", +} + +RELEASE = "release" +DEBUG = "debug" + +LIBTORCH_CONTAINER_IMAGES: dict[str, str] = { + **{gpu_arch: f"libtorch-cxx11-builder:cuda{gpu_arch}" for gpu_arch in CUDA_ARCHES}, + **{gpu_arch: f"libtorch-cxx11-builder:rocm{gpu_arch}" for gpu_arch in ROCM_ARCHES}, + "cpu": "libtorch-cxx11-builder:cpu", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"] @@ -210,7 +331,10 @@ def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str: return { "cpu": "cpu", "cpu-aarch64": "cpu", +<<<<<<< HEAD "cpu-cxx11-abi": "cpu-cxx11-abi", +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "cpu-s390x": "cpu", "cuda": f"cu{gpu_arch_version.replace('.', '')}", "cuda-aarch64": f"cu{gpu_arch_version.replace('-aarch64', '').replace('.', '')}", @@ -225,7 +349,11 @@ def list_without(in_list: list[str], without: list[str]) -> list[str]: def generate_libtorch_matrix( os: str, +<<<<<<< HEAD abi_version: str, +======= + release_type: str, +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) arches: Optional[list[str]] = None, libtorch_variants: Optional[list[str]] = None, ) -> list[dict[str, str]]: @@ -247,9 +375,12 @@ def generate_libtorch_matrix( ret: list[dict[str, str]] = [] for arch_version in arches: for libtorch_variant in libtorch_variants: +<<<<<<< HEAD # one of the values in the following list must be exactly # CXX11_ABI, but the precise value of the other one doesn't # matter +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) gpu_arch_type = arch_type(arch_version) gpu_arch_version = "" if arch_version == "cpu" else arch_version # ROCm builds without-deps failed even in ROCm runners; skip for now @@ -262,6 +393,7 @@ def generate_libtorch_matrix( "desired_cuda": translate_desired_cuda( gpu_arch_type, gpu_arch_version ), +<<<<<<< HEAD "libtorch_variant": libtorch_variant, "libtorch_config": abi_version if os in ("windows", "windows-arm64") @@ -271,11 +403,26 @@ def generate_libtorch_matrix( else "", "container_image": ( LIBTORCH_CONTAINER_IMAGES[(arch_version, abi_version)] +======= + "libtorch_config": release_type, + "libtorch_variant": libtorch_variant, + "container_image": ( + LIBTORCH_CONTAINER_IMAGES[arch_version].split(":")[0] + if os not in ("windows", "windows-arm64") + else "" + ), + "container_image_tag_prefix": ( + LIBTORCH_CONTAINER_IMAGES[arch_version].split(":")[1] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if os not in ("windows", "windows-arm64") else "" ), "package_type": "libtorch", +<<<<<<< HEAD "build_name": f"libtorch-{gpu_arch_type}{gpu_arch_version}-{libtorch_variant}-{abi_version}".replace( +======= + "build_name": f"libtorch-{gpu_arch_type}{gpu_arch_version}-{libtorch_variant}-{release_type}".replace( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ".", "_" ), } @@ -301,7 +448,11 @@ def generate_wheels_matrix( # Define default compute archivectures arches = ["cpu"] if os == "linux": +<<<<<<< HEAD arches += CPU_CXX11_ABI_ARCH + CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES +======= + arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif os == "windows": arches += CUDA_ARCHES + XPU_ARCHES elif os == "linux-aarch64": @@ -320,7 +471,10 @@ def generate_wheels_matrix( gpu_arch_version = ( "" if arch_version == "cpu" +<<<<<<< HEAD or arch_version == "cpu-cxx11-abi" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) or arch_version == "cpu-aarch64" or arch_version == "cpu-s390x" or arch_version == "xpu" @@ -332,10 +486,17 @@ def generate_wheels_matrix( continue if use_split_build and ( +<<<<<<< HEAD arch_version not in ["12.6", "12.8", "11.8", "cpu"] or os != "linux" ): raise RuntimeError( "Split build is only supported on linux with cuda 12*, 11.8, and cpu.\n" +======= + arch_version not in ["12.6", "12.8", "12.9", "cpu"] or os != "linux" + ): + raise RuntimeError( + "Split build is only supported on linux with cuda 12* and cpu.\n" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) f"Currently attempting to build on arch version 
{arch_version} and os {os}.\n" "Please modify the matrix generation to exclude this combination." ) @@ -343,7 +504,11 @@ def generate_wheels_matrix( # cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install if ( +<<<<<<< HEAD arch_version in ["12.8", "12.6", "11.8"] +======= + arch_version in ["12.9", "12.8", "12.6"] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) and os == "linux" or arch_version in CUDA_AARCH64_ARCHES ): @@ -355,8 +520,17 @@ def generate_wheels_matrix( "gpu_arch_version": gpu_arch_version, "desired_cuda": desired_cuda, "use_split_build": "True" if use_split_build else "False", +<<<<<<< HEAD "devtoolset": "cxx11-abi", "container_image": WHEEL_CONTAINER_IMAGES[arch_version], +======= + "container_image": WHEEL_CONTAINER_IMAGES[arch_version].split( + ":" + )[0], + "container_image_tag_prefix": WHEEL_CONTAINER_IMAGES[ + arch_version + ].split(":")[1], +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "package_type": package_type, "pytorch_extra_install_requirements": ( PYTORCH_EXTRA_INSTALL_REQUIREMENTS[ @@ -384,8 +558,17 @@ def generate_wheels_matrix( gpu_arch_type, gpu_arch_version ), "use_split_build": "True" if use_split_build else "False", +<<<<<<< HEAD "devtoolset": "", "container_image": WHEEL_CONTAINER_IMAGES[arch_version], +======= + "container_image": WHEEL_CONTAINER_IMAGES[ + arch_version + ].split(":")[0], + "container_image_tag_prefix": WHEEL_CONTAINER_IMAGES[ + arch_version + ].split(":")[1], +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "package_type": package_type, "pytorch_extra_install_requirements": "", "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-full".replace( # noqa: B950 @@ -403,6 +586,7 @@ def generate_wheels_matrix( gpu_arch_type, gpu_arch_version ), "use_split_build": "True" if use_split_build else "False", +<<<<<<< HEAD "devtoolset": ( "cxx11-abi" if (arch_version in ["cpu-cxx11-abi", "cpu-aarch64"]) @@ -410,6 +594,14 @@ def generate_wheels_matrix( else "" ), "container_image": WHEEL_CONTAINER_IMAGES[arch_version], +======= + "container_image": WHEEL_CONTAINER_IMAGES[arch_version].split( + ":" + )[0], + "container_image_tag_prefix": WHEEL_CONTAINER_IMAGES[ + arch_version + ].split(":")[1], +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "package_type": package_type, "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}".replace( ".", "_" @@ -425,6 +617,12 @@ def generate_wheels_matrix( return ret +<<<<<<< HEAD validate_nccl_dep_consistency("12.8") validate_nccl_dep_consistency("12.6") validate_nccl_dep_consistency("11.8") +======= +validate_nccl_dep_consistency("12.9") +validate_nccl_dep_consistency("12.8") +validate_nccl_dep_consistency("12.6") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index 520845413e20..da9fe66bbf1e 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -54,7 +54,10 @@ class BinaryBuildWorkflow: # Optional fields build_environment: str = "" +<<<<<<< HEAD abi_version: str = "" +======= +>>>>>>> 5729657180 
([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ciflow_config: CIFlowConfig = field(default_factory=CIFlowConfig) is_scheduled: str = "" branches: str = "nightly" @@ -62,6 +65,7 @@ class BinaryBuildWorkflow: cross_compile_arm64: bool = False macos_runner: str = "macos-14-xlarge" use_split_build: bool = False +<<<<<<< HEAD def __post_init__(self) -> None: if self.abi_version: @@ -70,6 +74,18 @@ def __post_init__(self) -> None: ) else: self.build_environment = f"{self.os}-binary-{self.package_type}" +======= + # Mainly used for libtorch builds + build_variant: str = "" + + def __post_init__(self) -> None: + if self.build_environment == "": + self.build_environment = "-".join( + item + for item in [self.os, "binary", self.package_type, self.build_variant] + if item != "" + ) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if self.use_split_build: # added to distinguish concurrency groups self.build_environment += "-split" @@ -133,10 +149,16 @@ class OperatingSystem: BinaryBuildWorkflow( os=OperatingSystem.LINUX, package_type="libtorch", +<<<<<<< HEAD abi_version=generate_binary_build_matrix.CXX11_ABI, build_configs=generate_binary_build_matrix.generate_libtorch_matrix( OperatingSystem.LINUX, generate_binary_build_matrix.CXX11_ABI, +======= + build_configs=generate_binary_build_matrix.generate_libtorch_matrix( + OperatingSystem.LINUX, + generate_binary_build_matrix.RELEASE, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) libtorch_variants=["shared-with-deps"], ), ciflow_config=CIFlowConfig( @@ -152,7 +174,11 @@ class OperatingSystem: package_type="manywheel", build_configs=generate_binary_build_matrix.generate_wheels_matrix( OperatingSystem.LINUX, +<<<<<<< HEAD arches=["11.8", "12.6", "12.8"], +======= + arches=["12.6", "12.8", "12.9", "6.4"], +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python_versions=["3.9"], ), branches="main", @@ -176,10 +202,17 @@ class OperatingSystem: BinaryBuildWorkflow( os=OperatingSystem.LINUX, package_type="libtorch", +<<<<<<< HEAD abi_version=generate_binary_build_matrix.CXX11_ABI, build_configs=generate_binary_build_matrix.generate_libtorch_matrix( OperatingSystem.LINUX, generate_binary_build_matrix.CXX11_ABI, +======= + build_variant=generate_binary_build_matrix.RELEASE, + build_configs=generate_binary_build_matrix.generate_libtorch_matrix( + OperatingSystem.LINUX, + generate_binary_build_matrix.RELEASE, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) arches=["cpu"], libtorch_variants=["shared-with-deps"], ), @@ -202,7 +235,11 @@ class OperatingSystem: BinaryBuildWorkflow( os=OperatingSystem.WINDOWS, package_type="libtorch", +<<<<<<< HEAD abi_version=generate_binary_build_matrix.RELEASE, +======= + build_variant=generate_binary_build_matrix.RELEASE, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build_configs=generate_binary_build_matrix.generate_libtorch_matrix( OperatingSystem.WINDOWS, generate_binary_build_matrix.RELEASE, @@ -216,7 +253,11 @@ class OperatingSystem: BinaryBuildWorkflow( os=OperatingSystem.WINDOWS, package_type="libtorch", +<<<<<<< HEAD abi_version=generate_binary_build_matrix.DEBUG, +======= + 
build_variant=generate_binary_build_matrix.DEBUG, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build_configs=generate_binary_build_matrix.generate_libtorch_matrix( OperatingSystem.WINDOWS, generate_binary_build_matrix.DEBUG, @@ -227,13 +268,63 @@ class OperatingSystem: isolated_workflow=True, ), ), +<<<<<<< HEAD +======= + BinaryBuildWorkflow( + os=OperatingSystem.WINDOWS_ARM64, + package_type="wheel", + build_configs=generate_binary_build_matrix.generate_wheels_matrix( + OperatingSystem.WINDOWS_ARM64, + arches=["cpu"], + python_versions=["3.11", "3.12", "3.13"], + ), + ciflow_config=CIFlowConfig( + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, + isolated_workflow=True, + ), + ), + BinaryBuildWorkflow( + os=OperatingSystem.WINDOWS_ARM64, + package_type="libtorch", + build_variant=generate_binary_build_matrix.RELEASE, + build_configs=generate_binary_build_matrix.generate_libtorch_matrix( + OperatingSystem.WINDOWS_ARM64, + generate_binary_build_matrix.RELEASE, + arches=["cpu"], + libtorch_variants=["shared-with-deps"], + ), + ciflow_config=CIFlowConfig( + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, + isolated_workflow=True, + ), + ), + BinaryBuildWorkflow( + os=OperatingSystem.WINDOWS_ARM64, + package_type="libtorch", + build_variant=generate_binary_build_matrix.DEBUG, + build_configs=generate_binary_build_matrix.generate_libtorch_matrix( + OperatingSystem.WINDOWS_ARM64, + generate_binary_build_matrix.DEBUG, + arches=["cpu"], + libtorch_variants=["shared-with-deps"], + ), + ciflow_config=CIFlowConfig( + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, + isolated_workflow=True, + ), + ), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] WINDOWS_BINARY_SMOKE_WORKFLOWS = [ BinaryBuildWorkflow( os=OperatingSystem.WINDOWS, package_type="libtorch", +<<<<<<< HEAD abi_version=generate_binary_build_matrix.RELEASE, +======= + build_variant=generate_binary_build_matrix.RELEASE, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build_configs=generate_binary_build_matrix.generate_libtorch_matrix( OperatingSystem.WINDOWS, generate_binary_build_matrix.RELEASE, @@ -248,7 +339,11 @@ class OperatingSystem: BinaryBuildWorkflow( os=OperatingSystem.WINDOWS, package_type="libtorch", +<<<<<<< HEAD abi_version=generate_binary_build_matrix.DEBUG, +======= + build_variant=generate_binary_build_matrix.DEBUG, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build_configs=generate_binary_build_matrix.generate_libtorch_matrix( OperatingSystem.WINDOWS, generate_binary_build_matrix.DEBUG, @@ -262,6 +357,7 @@ class OperatingSystem: ), ] +<<<<<<< HEAD WINDOWS_ARM64_BINARY_BUILD_WORKFLOWS = [ BinaryBuildWorkflow( os=OperatingSystem.WINDOWS_ARM64, @@ -308,14 +404,23 @@ class OperatingSystem: ), ] +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) MACOS_BINARY_BUILD_WORKFLOWS = [ BinaryBuildWorkflow( os=OperatingSystem.MACOS_ARM64, package_type="libtorch", +<<<<<<< HEAD abi_version=generate_binary_build_matrix.CXX11_ABI, build_configs=generate_binary_build_matrix.generate_libtorch_matrix( OperatingSystem.MACOS, generate_binary_build_matrix.CXX11_ABI, +======= + 
build_variant=generate_binary_build_matrix.RELEASE, + build_configs=generate_binary_build_matrix.generate_libtorch_matrix( + OperatingSystem.MACOS, + generate_binary_build_matrix.RELEASE, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) libtorch_variants=["shared-with-deps"], ), cross_compile_arm64=False, @@ -403,10 +508,13 @@ def main() -> None: WINDOWS_BINARY_SMOKE_WORKFLOWS, ), ( +<<<<<<< HEAD jinja_env.get_template("windows_arm64_binary_build_workflow.yml.j2"), WINDOWS_ARM64_BINARY_BUILD_WORKFLOWS, ), ( +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jinja_env.get_template("macos_binary_build_workflow.yml.j2"), MACOS_BINARY_BUILD_WORKFLOWS, ), diff --git a/.github/scripts/get_workflow_job_id.py b/.github/scripts/get_workflow_job_id.py index cfbfe315bf69..7c8cac54bff9 100644 --- a/.github/scripts/get_workflow_job_id.py +++ b/.github/scripts/get_workflow_job_id.py @@ -64,7 +64,11 @@ def fetch_url( ) exception_message = ( "Is github alright?", +<<<<<<< HEAD f"Recieved status code '{err.code}' when attempting to retrieve {url}:\n", +======= + f"Received status code '{err.code}' when attempting to retrieve {url}:\n", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) f"{err.reason}\n\nheaders={err.headers}", ) raise RuntimeError(exception_message) from err diff --git a/.github/scripts/github_utils.py b/.github/scripts/github_utils.py index 3a42298cdf37..cef5aa20f950 100644 --- a/.github/scripts/github_utils.py +++ b/.github/scripts/github_utils.py @@ -128,7 +128,11 @@ def gh_fetch_json_dict( def gh_graphql(query: str, **kwargs: Any) -> dict[str, Any]: rc = gh_fetch_url( +<<<<<<< HEAD "https://api.github.com/graphql", +======= + "https://api.github.com/graphql", # @lint-ignore +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) data={"query": query, "variables": kwargs}, reader=json.load, ) diff --git a/.github/scripts/gitutils.py b/.github/scripts/gitutils.py index 43ee063bd634..4b572f2b681c 100644 --- a/.github/scripts/gitutils.py +++ b/.github/scripts/gitutils.py @@ -211,7 +211,11 @@ def compute_branch_diffs( self, from_branch: str, to_branch: str ) -> tuple[list[str], list[str]]: """ +<<<<<<< HEAD Returns list of commmits that are missing in each other branch since their merge base +======= + Returns list of commits that are missing in each other branch since their merge base +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Might be slow if merge base is between two branches is pretty far off """ from_ref = self.rev_parse(from_branch) diff --git a/.github/scripts/label_utils.py b/.github/scripts/label_utils.py index 00c7cbf8e322..542acbd9b281 100644 --- a/.github/scripts/label_utils.py +++ b/.github/scripts/label_utils.py @@ -45,7 +45,11 @@ def get_last_page_num_from_header(header: Any) -> int: # rel="next", ; rel="last" link_info = header["link"] # Docs does not specify that it should be present for projects with just few labels +<<<<<<< HEAD # And https://github.com/malfet/deleteme/actions/runs/7334565243/job/19971396887 it's not the case +======= + # And https://github.com/malfet/deleteme/actions/runs/7334565243/job/19971396887 it's not the case # @lint-ignore +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if link_info is None: return 1 prefix = "&page=" diff --git a/.github/scripts/lintrunner.sh b/.github/scripts/lintrunner.sh index a3d78d116b3b..f1f479a0b3f9 100755 --- a/.github/scripts/lintrunner.sh +++ b/.github/scripts/lintrunner.sh @@ -1,6 +1,7 @@ #!/usr/bin/env bash set -ex +<<<<<<< HEAD # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") eval "$(command conda 'shell.bash' 'hook' 2> /dev/null)" @@ -8,6 +9,10 @@ conda activate "${CONDA_ENV}" # Use uv to speed up lintrunner init python3 -m pip install uv==0.1.45 +======= +# Use uv to speed up lintrunner init +python3 -m pip install uv==0.1.45 setuptools +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CACHE_DIRECTORY="/tmp/.lintbin" # Try to recover the cached binaries @@ -36,6 +41,12 @@ python3 -m tools.pyi.gen_pyi \ --deprecated-functions-path "tools/autograd/deprecated.yaml" python3 torch/utils/data/datapipes/gen_pyi.py +<<<<<<< HEAD +======= +# Also check generated pyi files +find torch -name '*.pyi' -exec git add --force -- "{}" + + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) RC=0 # Run lintrunner on all files if ! lintrunner --force-color --tee-json=lint.json ${ADDITIONAL_LINTRUNNER_ARGS} 2> /dev/null; then @@ -46,6 +57,12 @@ if ! lintrunner --force-color --tee-json=lint.json ${ADDITIONAL_LINTRUNNER_ARGS} RC=1 fi +<<<<<<< HEAD +======= +# Unstage temporally added pyi files +find torch -name '*.pyi' -exec git restore --staged -- "{}" + + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Use jq to massage the JSON lint output into GitHub Actions workflow commands. jq --raw-output \ '"::\(if .severity == "advice" or .severity == "disabled" then "warning" else .severity end) file=\(.path),line=\(.line),col=\(.char),title=\(.code) \(.name)::" + (.description | gsub("\\n"; "%0A"))' \ diff --git a/.github/scripts/pr-sanity-check.sh b/.github/scripts/pr-sanity-check.sh index 2b33dd91f770..82a74b535a6e 100644 --- a/.github/scripts/pr-sanity-check.sh +++ b/.github/scripts/pr-sanity-check.sh @@ -12,7 +12,11 @@ BASE=${BASE:-HEAD~1} HEAD=${HEAD:-HEAD} ancestor=$(git merge-base "${BASE}" "${HEAD}") +<<<<<<< HEAD echo "INFO: Checking aginst the following stats" +======= +echo "INFO: Checking against the following stats" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ( set -x git diff --stat=10000 "$ancestor" "${HEAD}" | sed '$d' > "${TMPFILE}" diff --git a/.github/scripts/runner_determinator.py b/.github/scripts/runner_determinator.py index e6846e42475b..69045db55bd7 100644 --- a/.github/scripts/runner_determinator.py +++ b/.github/scripts/runner_determinator.py @@ -199,6 +199,19 @@ def parse_args() -> Any: help="comma separated list of experiments to check, if omitted all experiments marked with default=True are checked", ) parser.add_argument( +<<<<<<< HEAD +======= + "--opt-out-experiments", + type=_str_comma_separated_to_set, + required=False, + default="", + help=( + "comma separated list of experiments to opt-out of. If unset, no opt-outs will occur. 
" + "If the same experiment is listed both here and in '--eligible-experiments' opt-out will take priority." + ), + ) + parser.add_argument( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "--pr-number", type=str, required=False, @@ -422,6 +435,10 @@ def get_runner_prefix( workflow_requestors: Iterable[str], branch: str, eligible_experiments: frozenset[str] = frozenset(), +<<<<<<< HEAD +======= + opt_out_experiments: frozenset[str] = frozenset(), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) is_canary: bool = False, ) -> str: settings = parse_settings(rollout_state) @@ -436,6 +453,17 @@ def get_runner_prefix( ) continue +<<<<<<< HEAD +======= + if opt_out_experiments: + if experiment_name in opt_out_experiments: + opt_out_exp_list = ", ".join(opt_out_experiments) + log.info( + f"Skipping experiment '{experiment_name}', as this workflow has opted-out (opted out experiments are: {opt_out_exp_list})" + ) + continue + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if eligible_experiments: if experiment_name not in eligible_experiments: exp_list = ", ".join(eligible_experiments) @@ -600,6 +628,10 @@ def main() -> None: (args.github_issue_owner, username), args.github_branch, args.eligible_experiments, +<<<<<<< HEAD +======= + args.opt_out_experiments, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) is_canary, ) diff --git a/.github/scripts/s390x-ci/self-hosted-builder/actions-runner.Dockerfile b/.github/scripts/s390x-ci/self-hosted-builder/actions-runner.Dockerfile index 7e7f47a459f3..0b0bf7f5d020 100644 --- a/.github/scripts/s390x-ci/self-hosted-builder/actions-runner.Dockerfile +++ b/.github/scripts/s390x-ci/self-hosted-builder/actions-runner.Dockerfile @@ -5,6 +5,53 @@ FROM --platform=linux/amd64 docker.io/ubuntu:24.04 as ld-prefix ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update && apt-get -y install ca-certificates libicu74 libssl3 +<<<<<<< HEAD +======= +# Patched podman +FROM --platform=linux/s390x docker.io/ubuntu:24.04 as podman +ENV DEBIAN_FRONTEND=noninteractive +RUN sed -i 's/^Types: deb$/Types: deb deb-src/' /etc/apt/sources.list.d/ubuntu.sources +RUN apt-get update && \ + apt-get install -y \ + cmake \ + curl \ + devscripts \ + dpkg-dev \ + gdb \ + less \ + make \ + python3 \ + python3-pip \ + quilt \ + rsync \ + software-properties-common \ + stress-ng \ + vim \ + nano \ + wget && \ + apt-get build-dep -y podman && \ + apt-get source podman + +COPY podman-patches/podman-25245.patch /tmp/podman-25245.patch +COPY podman-patches/podman-25102-backport.patch /tmp/podman-25102-backport.patch + +# import and apply patches +# patches: +# https://github.com/containers/podman/pull/25102 +# https://github.com/containers/podman/pull/25245 +RUN cd /libpod-* && \ + quilt import /tmp/podman-25245.patch && quilt push && \ + quilt import /tmp/podman-25102-backport.patch && quilt push && \ + dch -i "Fix podman deadlock and add option to clean up build leftovers" && \ + /bin/rm /tmp/podman-25245.patch /tmp/podman-25102-backport.patch + +# build patched podman +RUN cd /libpod-* && \ + debuild -i -us -uc -b && \ + /bin/rm /podman-remote_*.deb && \ + mkdir /tmp/podman && cp -v /podman*.deb /tmp/podman + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for 
mixed dtypes with float/bfloat16/half (#2791)) # Main image. FROM --platform=linux/s390x docker.io/ubuntu:24.04 @@ -45,7 +92,15 @@ COPY fs/ / RUN chmod +x /usr/bin/actions-runner /usr/bin/entrypoint # install podman +<<<<<<< HEAD RUN apt -y install podman podman-docker +======= +# RUN apt-get update && apt -y install podman podman-docker + +# install patched podman +COPY --from=podman /tmp/podman /tmp/podman +RUN apt-get update && apt -y install /tmp/podman/*.deb && /bin/rm -rfv /tmp/podman +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # amd64 Github Actions Runner. RUN useradd -m actions-runner @@ -65,7 +120,11 @@ RUN virtualenv --system-site-packages venv # COPY --chown=actions-runner:actions-runner manywheel-s390x.tar /home/actions-runner/manywheel-s390x.tar +<<<<<<< HEAD RUN curl -L https://github.com/actions/runner/releases/download/v2.317.0/actions-runner-linux-x64-2.317.0.tar.gz | tar -xz +======= +RUN curl -L https://github.com/actions/runner/releases/download/v2.322.0/actions-runner-linux-x64-2.322.0.tar.gz | tar -xz +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ENTRYPOINT ["/usr/bin/entrypoint"] CMD ["/usr/bin/actions-runner"] diff --git a/.github/scripts/s390x-ci/self-hosted-builder/fs/usr/bin/actions-runner b/.github/scripts/s390x-ci/self-hosted-builder/fs/usr/bin/actions-runner index 0fe99fe85da7..121f18c71a19 100644 --- a/.github/scripts/s390x-ci/self-hosted-builder/fs/usr/bin/actions-runner +++ b/.github/scripts/s390x-ci/self-hosted-builder/fs/usr/bin/actions-runner @@ -27,6 +27,12 @@ unset ACCESS_TOKEN # it does one job, stops and unregisters registration_token=$(jq --raw-output .token "$token_file") +<<<<<<< HEAD +======= +# workaround for https://gitlab.com/qemu-project/qemu/-/issues/2600 +export DOTNET_EnableWriteXorExecute=0 + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ./config.sh \ --unattended \ --ephemeral \ @@ -44,8 +50,11 @@ rm -f "$token_file" # and it doesn't work for non-root user source venv/bin/activate +<<<<<<< HEAD # workaround for https://gitlab.com/qemu-project/qemu/-/issues/2600 export DOTNET_EnableWriteXorExecute=0 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Run one job. ./run.sh diff --git a/.github/scripts/s390x-ci/self-hosted-builder/podman-patches/podman-25102-backport.patch b/.github/scripts/s390x-ci/self-hosted-builder/podman-patches/podman-25102-backport.patch new file mode 100644 index 000000000000..16dc10e85f6d --- /dev/null +++ b/.github/scripts/s390x-ci/self-hosted-builder/podman-patches/podman-25102-backport.patch @@ -0,0 +1,358 @@ +diff --git a/cmd/podman/system/prune.go b/cmd/podman/system/prune.go +index f7cf7b551..739f87cde 100644 +--- a/cmd/podman/system/prune.go ++++ b/cmd/podman/system/prune.go +@@ -48,6 +48,7 @@ func init() { + flags.BoolVarP(&force, "force", "f", false, "Do not prompt for confirmation. 
The default is false") + flags.BoolVarP(&pruneOptions.All, "all", "a", false, "Remove all unused data") + flags.BoolVar(&pruneOptions.External, "external", false, "Remove container data in storage not controlled by podman") ++ flags.BoolVar(&pruneOptions.Build, "build", false, "Remove build containers") + flags.BoolVar(&pruneOptions.Volume, "volumes", false, "Prune volumes") + filterFlagName := "filter" + flags.StringArrayVar(&filters, filterFlagName, []string{}, "Provide filter values (e.g. 'label==')") +@@ -64,8 +65,12 @@ func prune(cmd *cobra.Command, args []string) error { + volumeString = ` + - all volumes not used by at least one container` + } +- +- fmt.Printf(createPruneWarningMessage(pruneOptions), volumeString, "Are you sure you want to continue? [y/N] ") ++ buildString := "" ++ if pruneOptions.Build { ++ buildString = ` ++ - all build containers` ++ } ++ fmt.Printf(createPruneWarningMessage(pruneOptions), volumeString, buildString, "Are you sure you want to continue? [y/N] ") + + answer, err := reader.ReadString('\n') + if err != nil { +@@ -124,7 +129,7 @@ func createPruneWarningMessage(pruneOpts entities.SystemPruneOptions) string { + if pruneOpts.All { + return `WARNING! This command removes: + - all stopped containers +- - all networks not used by at least one container%s ++ - all networks not used by at least one container%s%s + - all images without at least one container associated with them + - all build cache + +@@ -132,7 +137,7 @@ func createPruneWarningMessage(pruneOpts entities.SystemPruneOptions) string { + } + return `WARNING! This command removes: + - all stopped containers +- - all networks not used by at least one container%s ++ - all networks not used by at least one container%s%s + - all dangling images + - all dangling build cache + +diff --git a/docs/source/markdown/podman-system-prune.1.md b/docs/source/markdown/podman-system-prune.1.md +index 52f9ec1c7..95099d018 100644 +--- a/docs/source/markdown/podman-system-prune.1.md ++++ b/docs/source/markdown/podman-system-prune.1.md +@@ -7,20 +7,28 @@ podman\-system\-prune - Remove all unused pods, containers, images, networks, an + **podman system prune** [*options*] + + ## DESCRIPTION +-**podman system prune** removes all unused containers (both dangling and unreferenced), pods, networks, and optionally, volumes from local storage. ++**podman system prune** removes all unused containers (both dangling and unreferenced), build containers, pods, networks, and optionally, volumes from local storage. + + Use the **--all** option to delete all unused images. Unused images are dangling images as well as any image that does not have any containers based on it. + + By default, volumes are not removed to prevent important data from being deleted if there is currently no container using the volume. Use the **--volumes** flag when running the command to prune volumes as well. + ++By default, build containers are not removed to prevent interference with builds in progress. Use the **--build** flag when running the command to remove build containers as well. ++ + ## OPTIONS + #### **--all**, **-a** + + Recursively remove all unused pods, containers, images, networks, and volume data. (Maximum 50 iterations.) + ++#### **--build** ++ ++Removes any build containers that were created during the build, but were not removed because the build was unexpectedly terminated. ++ ++Note: **This is not safe operation and should be executed only when no builds are in progress. 
It can interfere with builds in progress.** ++ + #### **--external** + +-Removes all leftover container storage files from local storage not managed by Podman. In normal circumstances, no such data exists, but in case of an unclean shutdown, the Podman database may be corrupted and cause this. ++Tries to clean up remainders of previous containers or layers that are not references in the storage json files. These can happen in the case of unclean shutdowns or regular restarts in transient storage mode. + + However, when using transient storage mode, the Podman database does not persist. This means containers leave the writable layers on disk after a reboot. When using a transient store, it is recommended that the **podman system prune --external** command is run during boot. + +diff --git a/libpod/runtime.go b/libpod/runtime.go +index 986e40f60..609fbba57 100644 +--- a/libpod/runtime.go ++++ b/libpod/runtime.go +@@ -33,6 +33,7 @@ import ( + "github.com/containers/podman/v4/libpod/lock" + "github.com/containers/podman/v4/libpod/plugin" + "github.com/containers/podman/v4/libpod/shutdown" ++ "github.com/containers/podman/v4/pkg/domain/entities/reports" + "github.com/containers/podman/v4/pkg/rootless" + "github.com/containers/podman/v4/pkg/systemd" + "github.com/containers/podman/v4/pkg/util" +@@ -1250,3 +1251,52 @@ func (r *Runtime) LockConflicts() (map[uint32][]string, []uint32, error) { + + return toReturn, locksHeld, nil + } ++ ++// Exists checks whether a file or directory exists at the given path. ++// If the path is a symlink, the symlink is followed. ++func Exists(path string) error { ++ // It uses unix.Faccessat which is a faster operation compared to os.Stat for ++ // simply checking the existence of a file. ++ err := unix.Faccessat(unix.AT_FDCWD, path, unix.F_OK, 0) ++ if err != nil { ++ return &os.PathError{Op: "faccessat", Path: path, Err: err} ++ } ++ return nil ++} ++ ++// PruneBuildContainers removes any build containers that were created during the build, ++// but were not removed because the build was unexpectedly terminated. ++// ++// Note: This is not safe operation and should be executed only when no builds are in progress. It can interfere with builds in progress. 
++func (r *Runtime) PruneBuildContainers() ([]*reports.PruneReport, error) { ++ stageContainersPruneReports := []*reports.PruneReport{} ++ ++ containers, err := r.store.Containers() ++ if err != nil { ++ return stageContainersPruneReports, err ++ } ++ for _, container := range containers { ++ path, err := r.store.ContainerDirectory(container.ID) ++ if err != nil { ++ return stageContainersPruneReports, err ++ } ++ if err := Exists(filepath.Join(path, "buildah.json")); err != nil { ++ continue ++ } ++ ++ report := &reports.PruneReport{ ++ Id: container.ID, ++ } ++ size, err := r.store.ContainerSize(container.ID) ++ if err != nil { ++ report.Err = err ++ } ++ report.Size = uint64(size) ++ ++ if err := r.store.DeleteContainer(container.ID); err != nil { ++ report.Err = errors.Join(report.Err, err) ++ } ++ stageContainersPruneReports = append(stageContainersPruneReports, report) ++ } ++ return stageContainersPruneReports, nil ++} +diff --git a/pkg/api/handlers/libpod/system.go b/pkg/api/handlers/libpod/system.go +index 70d4493f8..7c129b1ba 100644 +--- a/pkg/api/handlers/libpod/system.go ++++ b/pkg/api/handlers/libpod/system.go +@@ -22,6 +22,7 @@ func SystemPrune(w http.ResponseWriter, r *http.Request) { + All bool `schema:"all"` + Volumes bool `schema:"volumes"` + External bool `schema:"external"` ++ Build bool `schema:"build"` + }{} + + if err := decoder.Decode(&query, r.URL.Query()); err != nil { +@@ -43,6 +44,7 @@ func SystemPrune(w http.ResponseWriter, r *http.Request) { + Volume: query.Volumes, + Filters: *filterMap, + External: query.External, ++ Build: query.Build, + } + report, err := containerEngine.SystemPrune(r.Context(), pruneOptions) + if err != nil { +diff --git a/pkg/bindings/system/types.go b/pkg/bindings/system/types.go +index 89e093f68..b4a4ff064 100644 +--- a/pkg/bindings/system/types.go ++++ b/pkg/bindings/system/types.go +@@ -18,6 +18,7 @@ type PruneOptions struct { + Filters map[string][]string + Volumes *bool + External *bool ++ Build *bool + } + + // VersionOptions are optional options for getting version info +diff --git a/pkg/bindings/system/types_prune_options.go b/pkg/bindings/system/types_prune_options.go +index d00498520..5f3bd652c 100644 +--- a/pkg/bindings/system/types_prune_options.go ++++ b/pkg/bindings/system/types_prune_options.go +@@ -76,3 +76,18 @@ func (o *PruneOptions) GetExternal() bool { + } + return *o.External + } ++ ++// WithBuild set field Build to given value ++func (o *PruneOptions) WithBuild(value bool) *PruneOptions { ++ o.Build = &value ++ return o ++} ++ ++// GetBuild returns value of field Build ++func (o *PruneOptions) GetBuild() bool { ++ if o.Build == nil { ++ var z bool ++ return z ++ } ++ return *o.Build ++} +diff --git a/pkg/domain/entities/system.go b/pkg/domain/entities/system.go +index 473db3530..f6938652a 100644 +--- a/pkg/domain/entities/system.go ++++ b/pkg/domain/entities/system.go +@@ -22,6 +22,7 @@ type SystemPruneOptions struct { + Volume bool + Filters map[string][]string `json:"filters" schema:"filters"` + External bool ++ Build bool + } + + // SystemPruneReport provides report after system prune is executed. +diff --git a/pkg/domain/infra/abi/system.go b/pkg/domain/infra/abi/system.go +index 24ee64d29..ea3e5f203 100644 +--- a/pkg/domain/infra/abi/system.go ++++ b/pkg/domain/infra/abi/system.go +@@ -150,16 +150,16 @@ func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool) + return nil + } + +-// SystemPrune removes unused data from the system. Pruning pods, containers, networks, volumes and images. 
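The PruneBuildContainers helper added above flags leftover build (stage) containers by checking for a buildah.json file in each container's storage directory before deleting it. A rough Python analogue of just that detection step, with made-up stand-ins for podman's store API:

import os

def find_stage_containers(container_dirs: dict[str, str]) -> list[str]:
    # container_dirs maps container ID -> storage directory, standing in for
    # r.store.Containers() / r.store.ContainerDirectory() in the Go code above.
    return [
        cid
        for cid, path in container_dirs.items()
        if os.path.exists(os.path.join(path, "buildah.json"))
    ]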
++// SystemPrune removes unused data from the system. Pruning pods, containers, build container, networks, volumes and images. + func (ic *ContainerEngine) SystemPrune(ctx context.Context, options entities.SystemPruneOptions) (*entities.SystemPruneReport, error) { + var systemPruneReport = new(entities.SystemPruneReport) + + if options.External { +- if options.All || options.Volume || len(options.Filters) > 0 { ++ if options.All || options.Volume || len(options.Filters) > 0 || options.Build { + return nil, fmt.Errorf("system prune --external cannot be combined with other options") + } +- err := ic.Libpod.GarbageCollect() +- if err != nil { ++ ++ if err := ic.Libpod.GarbageCollect(); err != nil { + return nil, err + } + return systemPruneReport, nil +@@ -170,6 +170,17 @@ func (ic *ContainerEngine) SystemPrune(ctx context.Context, options entities.Sys + filters = append(filters, fmt.Sprintf("%s=%s", k, v[0])) + } + reclaimedSpace := (uint64)(0) ++ ++ // Prune Build Containers ++ if options.Build { ++ stageContainersPruneReports, err := ic.Libpod.PruneBuildContainers() ++ if err != nil { ++ return nil, err ++ } ++ reclaimedSpace += reports.PruneReportsSize(stageContainersPruneReports) ++ systemPruneReport.ContainerPruneReports = append(systemPruneReport.ContainerPruneReports, stageContainersPruneReports...) ++ } ++ + found := true + for found { + found = false +diff --git a/pkg/domain/infra/tunnel/system.go b/pkg/domain/infra/tunnel/system.go +index fc82e7b2b..142a9fa5c 100644 +--- a/pkg/domain/infra/tunnel/system.go ++++ b/pkg/domain/infra/tunnel/system.go +@@ -19,7 +19,7 @@ func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool) + + // SystemPrune prunes unused data from the system. + func (ic *ContainerEngine) SystemPrune(ctx context.Context, opts entities.SystemPruneOptions) (*entities.SystemPruneReport, error) { +- options := new(system.PruneOptions).WithAll(opts.All).WithVolumes(opts.Volume).WithFilters(opts.Filters).WithExternal(opts.External) ++ options := new(system.PruneOptions).WithAll(opts.All).WithVolumes(opts.Volume).WithFilters(opts.Filters).WithExternal(opts.External).WithBuild(opts.Build) + return system.Prune(ic.ClientCtx, options) + } + +diff --git a/test/e2e/prune_test.go b/test/e2e/prune_test.go +index 01e848478..57bd5582d 100644 +--- a/test/e2e/prune_test.go ++++ b/test/e2e/prune_test.go +@@ -4,6 +4,8 @@ import ( + "fmt" + "os" + "path/filepath" ++ "syscall" ++ "time" + + . "github.com/containers/podman/v4/test/utils" + . "github.com/onsi/ginkgo/v2" +@@ -22,6 +24,11 @@ FROM scratch + ENV test1=test1 + ENV test2=test2` + ++var longBuildImage = fmt.Sprintf(` ++FROM %s ++RUN echo "Hello, World!" 
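The SystemPrune change above extends the existing guard so that --external also rejects the new --build flag. A condensed Python restatement of that option check (field names mirror the Go struct; the function itself is illustrative, not part of the patch):

def validate_prune_options(all_: bool, volumes: bool, filters: dict, external: bool, build: bool) -> None:
    # Mirrors the guard in pkg/domain/infra/abi/system.go: --external must be used alone.
    if external and (all_ or volumes or filters or build):
        raise ValueError("system prune --external cannot be combined with other options")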
++RUN RUN echo "Please use signal 9 this will never ends" && sleep 10000s`, ALPINE) ++ + var _ = Describe("Podman prune", func() { + + It("podman container prune containers", func() { +@@ -593,4 +600,63 @@ var _ = Describe("Podman prune", func() { + Expect(err).ToNot(HaveOccurred()) + Expect(dirents).To(HaveLen(3)) + }) ++ ++ It("podman system prune --build clean up after terminated build", func() { ++ useCustomNetworkDir(podmanTest, tempdir) ++ ++ podmanTest.BuildImage(pruneImage, "alpine_notleaker:latest", "false") ++ ++ create := podmanTest.Podman([]string{"create", "--name", "test", BB, "sleep", "10000"}) ++ create.WaitWithDefaultTimeout() ++ Expect(create).Should(ExitCleanly()) ++ ++ containerFilePath := filepath.Join(podmanTest.TempDir, "ContainerFile-podman-leaker") ++ err := os.WriteFile(containerFilePath, []byte(longBuildImage), 0755) ++ Expect(err).ToNot(HaveOccurred()) ++ ++ build := podmanTest.Podman([]string{"build", "-f", containerFilePath, "-t", "podmanleaker"}) ++ // Build will never finish so let's wait for build to ask for SIGKILL to simulate a failed build that leaves stage containers. ++ matchedOutput := false ++ for range 900 { ++ if build.LineInOutputContains("Please use signal 9") { ++ matchedOutput = true ++ build.Signal(syscall.SIGKILL) ++ break ++ } ++ time.Sleep(100 * time.Millisecond) ++ } ++ if !matchedOutput { ++ Fail("Did not match special string in podman build") ++ } ++ ++ // Check Intermediate image of stage container ++ none := podmanTest.Podman([]string{"images", "-a"}) ++ none.WaitWithDefaultTimeout() ++ Expect(none).Should(ExitCleanly()) ++ Expect(none.OutputToString()).Should(ContainSubstring("none")) ++ ++ // Check if Container and Stage Container exist ++ count := podmanTest.Podman([]string{"ps", "-aq", "--external"}) ++ count.WaitWithDefaultTimeout() ++ Expect(count).Should(ExitCleanly()) ++ Expect(count.OutputToStringArray()).To(HaveLen(3)) ++ ++ prune := podmanTest.Podman([]string{"system", "prune", "--build", "-f"}) ++ prune.WaitWithDefaultTimeout() ++ Expect(prune).Should(ExitCleanly()) ++ ++ // Container should still exist, but no stage containers ++ count = podmanTest.Podman([]string{"ps", "-aq", "--external"}) ++ count.WaitWithDefaultTimeout() ++ Expect(count).Should(ExitCleanly()) ++ Expect(count.OutputToString()).To(BeEmpty()) ++ ++ Expect(podmanTest.NumberOfContainers()).To(Equal(0)) ++ ++ after := podmanTest.Podman([]string{"images", "-a"}) ++ after.WaitWithDefaultTimeout() ++ Expect(after).Should(ExitCleanly()) ++ Expect(after.OutputToString()).ShouldNot(ContainSubstring("none")) ++ Expect(after.OutputToString()).Should(ContainSubstring("notleaker")) ++ }) + }) + diff --git a/.github/scripts/s390x-ci/self-hosted-builder/podman-patches/podman-25245.patch b/.github/scripts/s390x-ci/self-hosted-builder/podman-patches/podman-25245.patch new file mode 100644 index 000000000000..bf79f7904035 --- /dev/null +++ b/.github/scripts/s390x-ci/self-hosted-builder/podman-patches/podman-25245.patch @@ -0,0 +1,21 @@ +diff --git a/pkg/rootless/rootless_linux.c b/pkg/rootless/rootless_linux.c +index 4f71d49e5c..3d74af6a6c 100644 +--- a/pkg/rootless/rootless_linux.c ++++ b/pkg/rootless/rootless_linux.c +@@ -658,7 +658,7 @@ create_pause_process (const char *pause_pid_file_path, char **argv) + if (pipe (p) < 0) + return -1; + +- pid = fork (); ++ pid = syscall_clone (SIGCHLD, NULL); + if (pid < 0) + { + close (p[0]); +@@ -689,7 +689,7 @@ create_pause_process (const char *pause_pid_file_path, char **argv) + close (p[0]); + + setsid (); +- pid = fork (); ++ 
pid = syscall_clone (SIGCHLD, NULL); + if (pid < 0) + _exit (EXIT_FAILURE); diff --git a/.github/scripts/test_filter_test_configs.py b/.github/scripts/test_filter_test_configs.py index 378f72237601..15bcd3ef6873 100755 --- a/.github/scripts/test_filter_test_configs.py +++ b/.github/scripts/test_filter_test_configs.py @@ -347,26 +347,46 @@ def test_set_periodic_modes(self) -> None: { "job_name": "a-ci-job", "test_matrix": '{include: [{config: "default", runner: "linux"}, {config: "cfg", runner: "macos"}]}', +<<<<<<< HEAD "descripion": "Replicate each periodic mode in a different config", +======= + "description": "Replicate each periodic mode in a different config", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }, { "job_name": "a-ci-cuda11.8-job", "test_matrix": '{include: [{config: "default", runner: "linux"}, {config: "cfg", runner: "macos"}]}', +<<<<<<< HEAD "descripion": "Replicate each periodic mode in a different config for a CUDA job", +======= + "description": "Replicate each periodic mode in a different config for a CUDA job", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }, { "job_name": "a-ci-rocm-job", "test_matrix": '{include: [{config: "default", runner: "linux"}, {config: "cfg", runner: "macos"}]}', +<<<<<<< HEAD "descripion": "Replicate each periodic mode in a different config for a ROCm job", +======= + "description": "Replicate each periodic mode in a different config for a ROCm job", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }, { "job_name": "", "test_matrix": '{include: [{config: "default", runner: "linux"}, {config: "cfg", runner: "macos"}]}', +<<<<<<< HEAD "descripion": "Empty job name", }, { "test_matrix": '{include: [{config: "default", runner: "linux"}, {config: "cfg", runner: "macos"}]}', "descripion": "Missing job name", +======= + "description": "Empty job name", + }, + { + "test_matrix": '{include: [{config: "default", runner: "linux"}, {config: "cfg", runner: "macos"}]}', + "description": "Missing job name", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }, ] @@ -807,7 +827,11 @@ def test_parse_reenabled_issues(self) -> None: # test bad things pr_body = ( "fixes189 fixeshttps://github.com/pytorch/pytorch/issues/75123 " +<<<<<<< HEAD "closedhttps://githubcom/pytorch/pytorch/issues/75123" +======= + "closedhttps://githubcom/pytorch/pytorch/issues/75123" # @lint-ignore +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "fix 234, fixes # 45, fixing #123, close 234, closes#45, closing #123 resolve 234, " "resolves #45, resolving #123" ) diff --git a/.github/scripts/test_trymerge.py b/.github/scripts/test_trymerge.py index 1a152dc95945..225546e48720 100755 --- a/.github/scripts/test_trymerge.py +++ b/.github/scripts/test_trymerge.py @@ -19,6 +19,10 @@ from github_utils import gh_graphql from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo from trymerge import ( +<<<<<<< HEAD +======= + _revlist_to_prs, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) categorize_checks, DRCI_CHECKRUN_NAME, find_matching_merge_rule, @@ -264,7 +268,11 @@ def commits_resolving_gh_pr(self, pr_num: 
int) -> list[str]:
         return ["FakeCommitSha"]
 
     def commit_message(self, ref: str) -> str:
+<<<<<<< HEAD
         return "super awsome commit message"
+=======
+        return "super awesome commit message"
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 
 
 @mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@@ -432,7 +440,11 @@ def test_get_checkruns_many_runs(self, *args: Any) -> None:
         )
 
     def test_cancelled_gets_ignored(self, *args: Any) -> None:
+<<<<<<< HEAD
         """Tests that cancelled workflow does not override existing successfull status"""
+=======
+        """Tests that cancelled workflow does not override existing successful status"""
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
         pr = GitHubPR("pytorch", "pytorch", 110367)
         conclusions = pr.get_checkrun_conclusions()
         lint_checks = [name for name in conclusions.keys() if "Lint" in name]
@@ -1088,5 +1100,54 @@ def test_merge_ghstack_into(
         )
 
 
+<<<<<<< HEAD
+=======
+@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
+@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
+@mock.patch(
+    "trymerge.get_drci_classifications", side_effect=mocked_drci_classifications
+)
+@mock.patch.object(DummyGitRepo, "commit_message")
+class TestRevListToPR(TestCase):
+    # Tests for _revlist_to_prs function
+    def test__revlist_to_prs_zero_matches(
+        self, mock_commit_message: mock.MagicMock, *args: Any
+    ) -> None:
+        # If zero PRs are mentioned in the commit message, it should raise an error
+        pr_num = 154098
+        pr = GitHubPR("pytorch", "pytorch", pr_num)
+        repo = DummyGitRepo()
+        mock_commit_message.return_value = "no PRs"
+        self.assertRaisesRegex(
+            RuntimeError,
+            "PRs mentioned in commit dummy: 0.",
+            lambda: _revlist_to_prs(repo, pr, ["dummy"]),
+        )
+
+    def test__revlist_to_prs_two_prs(
+        self, mock_commit_message: mock.MagicMock, *args: Any
+    ) -> None:
+        # If two PRs are mentioned in the commit message, it should raise an error
+        pr_num = 154394
+        pr = GitHubPR("pytorch", "pytorch", pr_num)
+        repo = DummyGitRepo()
+        # https://github.com/pytorch/pytorch/commit/343c56e7650f55fd030aca0b9275d6d73501d3f4
+
+        commit_message = """add sticky cache pgo
+
+ghstack-source-id: 9bc6dee0b427819f978bfabccb72727ba8be2f81
+Pull-Request-resolved: https://github.com/pytorch/pytorch/pull/154098
+
+ghstack-source-id: 9bc6dee0b427819f978bfabccb72727ba8be2f81
+Pull Request resolved: https://github.com/pytorch/pytorch/pull/154394"""
+        mock_commit_message.return_value = commit_message
+        self.assertRaisesRegex(
+            RuntimeError,
+            "PRs mentioned in commit dummy: 2.",
+            lambda: _revlist_to_prs(repo, pr, ["dummy"]),
+        )
+
+
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 if __name__ == "__main__":
     main()
diff --git a/.github/scripts/trymerge.py b/.github/scripts/trymerge.py
index e43494e31301..0f4889b10516 100755
--- a/.github/scripts/trymerge.py
+++ b/.github/scripts/trymerge.py
@@ -434,7 +434,11 @@ def __init__(self, name: str, url: str, run_id: int, status: Optional[str]):
 RE_GHSTACK_HEAD_REF = re.compile(r"^(gh/[^/]+/[0-9]+/)head$")
 RE_GHSTACK_DESC = re.compile(r"Stack.*:\r?\n(\* [^\r\n]+\r?\n)+", re.MULTILINE)
 RE_PULL_REQUEST_RESOLVED = re.compile(
+<<<<<<< HEAD
     r"Pull Request resolved: "
+=======
+    r"(Pull Request resolved|Pull-Request-resolved|Pull-Request): "
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel
for mixed dtypes with float/bfloat16/half (#2791)) r"https://github.com/(?P[^/]+)/(?P[^/]+)/pull/(?P[0-9]+)", re.MULTILINE, ) @@ -628,11 +632,25 @@ def _revlist_to_prs( rc: list[tuple[GitHubPR, str]] = [] for idx, rev in enumerate(rev_list): msg = repo.commit_message(rev) +<<<<<<< HEAD m = RE_PULL_REQUEST_RESOLVED.search(msg) if m is None: raise RuntimeError( f"Could not find PR-resolved string in {msg} of ghstacked PR {pr.pr_num}" ) +======= + # findall doesn't return named captures, so we need to use finditer + all_matches = list(RE_PULL_REQUEST_RESOLVED.finditer(msg)) + if len(all_matches) != 1: + raise RuntimeError( + f"Found an unexpected number of PRs mentioned in commit {rev}: " + f"{len(all_matches)}. This is probably because you are using an " + "old version of ghstack. Please update ghstack and resubmit " + "your PRs" + ) + + m = all_matches[0] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if pr.org != m.group("owner") or pr.project != m.group("repo"): raise RuntimeError( f"PR {m.group('number')} resolved to wrong owner/repo pair" @@ -666,6 +684,12 @@ def skip_func(idx: int, candidate: "GitHubPR") -> bool: assert pr.is_ghstack_pr() entire_stack = _revlist_to_prs(repo, pr, reversed(rev_list), skip_func) +<<<<<<< HEAD +======= + print( + f"Found {len(entire_stack)} PRs in the stack for {pr.pr_num}: {[x[0].pr_num for x in entire_stack]}" + ) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for stacked_pr, rev in entire_stack: if stacked_pr.is_closed(): @@ -819,10 +843,16 @@ def _get_reviews(self) -> list[tuple[str, str]]: cursor=info["reviews"]["pageInfo"]["startCursor"], ) info = rc["data"]["repository"]["pullRequest"] +<<<<<<< HEAD reviews = {} for author, state in self._reviews: if state != "COMMENTED": reviews[author] = state +======= + reviews = { + author: state for author, state in self._reviews if state != "COMMENTED" + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return list(reviews.items()) def get_approved_by(self) -> list[str]: @@ -940,6 +970,15 @@ def get_pr_next_checksuites(checksuites: Any) -> Any: summary=None, ) +<<<<<<< HEAD +======= + # Making an exception for Apply lint auggestions/autoformat because the + # bot adds a merged label -> triggers workflow -> sometimes needs + # approval -> is read as failure, which results in a blocked merge, but + # this workflow doesn't provide mergability info + self.conclusions.pop("Apply lint suggestions", None) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return self.conclusions def get_authors(self) -> dict[str, str]: @@ -1939,6 +1978,10 @@ def get_ghstack_dependent_prs( def do_revert_prs( repo: GitRepo, +<<<<<<< HEAD +======= + original_pr: GitHubPR, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) shas_and_prs: list[tuple[str, GitHubPR]], *, author_login: str, @@ -1960,9 +2003,22 @@ def do_revert_prs( # Comment/reopen PRs for commit_sha, pr in shas_and_prs: +<<<<<<< HEAD revert_message = ( f"@{pr.get_pr_creator_login()} your PR has been successfully reverted." ) +======= + revert_message = "" + if pr.pr_num == original_pr.pr_num: + revert_message += ( + f"@{pr.get_pr_creator_login()} your PR has been successfully reverted." 
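The incoming branch widens RE_PULL_REQUEST_RESOLVED to accept the ghstack spellings shown above and switches from findall to finditer because findall drops named capture groups. A small self-contained check of that behaviour (the pattern restates the incoming regex; the group names match the owner/repo/number groups used later in trymerge.py):

import re

RE_PULL_REQUEST_RESOLVED = re.compile(
    r"(Pull Request resolved|Pull-Request-resolved|Pull-Request): "
    r"https://github.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)/pull/(?P<number>[0-9]+)",
    re.MULTILINE,
)

msg = "Pull-Request-resolved: https://github.com/pytorch/pytorch/pull/154098"
matches = list(RE_PULL_REQUEST_RESOLVED.finditer(msg))  # finditer yields Match objects with named groups
assert len(matches) == 1
assert matches[0].group("owner") == "pytorch" and matches[0].group("number") == "154098"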
+ ) + else: + revert_message += ( + f"@{pr.get_pr_creator_login()} your PR has been reverted as part of the stack under " + f"#{original_pr.pr_num}.\n" + ) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if ( pr.has_internal_changes() and not pr.has_no_connected_diff() @@ -2014,6 +2070,10 @@ def try_revert( do_revert_prs( repo, +<<<<<<< HEAD +======= + pr, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) shas_and_prs, author_login=author_login, extra_msg=extra_msg, @@ -2032,7 +2092,11 @@ def check_for_sev(org: str, project: str, skip_mandatory_checks: bool) -> None: response = cast( dict[str, Any], gh_fetch_json_list( +<<<<<<< HEAD "https://api.github.com/search/issues", +======= + "https://api.github.com/search/issues", # @lint-ignore +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Having two label: queries is an AND operation params={ "q": f'repo:{org}/{project} is:open is:issue label:"ci: sev" label:"merge blocking"' @@ -2282,7 +2346,12 @@ def merge( except MandatoryChecksMissingError as ex: last_exception = str(ex) print( +<<<<<<< HEAD f"Merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} failed due to: {ex}. Retrying in 5 min" +======= + f"Merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} failed due to: {ex}. Retrying in 5 min", + flush=True, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) time.sleep(5 * 60) # Finally report timeout back diff --git a/.github/scripts/tryrebase.py b/.github/scripts/tryrebase.py index 0f6d74e8346e..ddeff1d62d98 100755 --- a/.github/scripts/tryrebase.py +++ b/.github/scripts/tryrebase.py @@ -132,17 +132,30 @@ def rebase_ghstack_onto( # The contents of a successful push result should look like: # Summary of changes (ghstack 0.6.0) +<<<<<<< HEAD # - Updated https://github.com/clee2000/random-testing/pull/2 # - Updated https://github.com/clee2000/random-testing/pull/1 +======= + # - Updated https://github.com/clee2000/random-testing-public/pull/2 + # - Updated https://github.com/clee2000/random-testing-public/pull/1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Facebook employees can import your changes by running # (on a Facebook machine): +<<<<<<< HEAD # ghimport -s https://github.com/clee2000/random-testing/pull/2 # If you want to work on this diff stack on another machine: # ghstack checkout https://github.com/clee2000/random-testing/pull/2 +======= + # ghimport -s https://github.com/clee2000/random-testing-public/pull/2 + + # If you want to work on this diff stack on another machine: + + # ghstack checkout https://github.com/clee2000/random-testing-public/pull/2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) org, project = repo.gh_owner_and_name() for line in push_result.splitlines(): if "Updated" in line: diff --git a/.github/scripts/windows/build_magma.bat b/.github/scripts/windows/build_magma.bat index beabb0070554..b203f9fe64ee 100644 --- a/.github/scripts/windows/build_magma.bat +++ b/.github/scripts/windows/build_magma.bat @@ -17,7 +17,10 @@ if errorlevel 1 exit /b 1 set "PATH=C:\Tools;C:\Program Files\NVIDIA GPU Computing 
Toolkit\CUDA\v%CUVER%\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUVER%\libnvvp;%PATH%" set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUVER% +<<<<<<< HEAD set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) mkdir magma_cuda%CUVER_NODOT% cd magma_cuda%CUVER_NODOT% @@ -35,6 +38,7 @@ cd magma mkdir build && cd build set GPU_TARGET=All +<<<<<<< HEAD if "%CUVER_NODOT%" == "128" ( set CUDA_ARCH_LIST=-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 ) @@ -44,6 +48,17 @@ if "%CUVER_NODOT:~0,2%" == "12" if NOT "%CUVER_NODOT%" == "128" ( if "%CUVER_NODOT%" == "118" ( set CUDA_ARCH_LIST= -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 ) +======= +if "%CUVER_NODOT%" == "129" ( + set CUDA_ARCH_LIST=-gencode=arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 +) +if "%CUVER_NODOT%" == "128" ( + set CUDA_ARCH_LIST=-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 +) +if "%CUVER_NODOT%" == "126" ( + set CUDA_ARCH_LIST=-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 +) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set CC=cl.exe set CXX=cl.exe @@ -54,7 +69,12 @@ cmake .. -DGPU_TARGET="%GPU_TARGET%" ^ -DCMAKE_BUILD_TYPE=%CONFIG% ^ -DCMAKE_GENERATOR=Ninja ^ -DCMAKE_INSTALL_PREFIX=..\install\ ^ +<<<<<<< HEAD -DCUDA_ARCH_LIST="%CUDA_ARCH_LIST%" +======= + -DCUDA_ARCH_LIST="%CUDA_ARCH_LIST%" ^ + -DCMAKE_POLICY_VERSION_MINIMUM=3.5 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if errorlevel 1 exit /b 1 cmake --build . 
--target install --config %CONFIG% -- -j%NUMBER_OF_PROCESSORS% diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2 index 1a2b282690c1..25905be3c775 100644 --- a/.github/templates/common.yml.j2 +++ b/.github/templates/common.yml.j2 @@ -32,7 +32,11 @@ concurrency: {%- macro setup_ec2_windows() -%} !{{ display_ec2_information() }} - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2 index efb415759c95..9bc9a3015051 100644 --- a/.github/templates/linux_binary_build_workflow.yml.j2 +++ b/.github/templates/linux_binary_build_workflow.yml.j2 @@ -29,6 +29,12 @@ on: {%- endfor %} workflow_dispatch: +<<<<<<< HEAD +======= +permissions: + id-token: write + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: # Needed for conda builds {%- if "aarch64" in build_environment %} @@ -53,7 +59,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -111,12 +121,21 @@ jobs: ALPINE_IMAGE: "docker.io/s390x/alpine" {%- elif config["gpu_arch_type"] == "rocm" %} runs_on: linux.rocm.gpu +<<<<<<< HEAD {%- elif config["gpu_arch_type"] == "cuda" and config["gpu_arch_version"] == "12.8" %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner {%- elif config["gpu_arch_type"] == "cuda" and config["gpu_arch_version"] != "12.8"%} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.4xlarge.nvidia.gpu +======= + {%- elif config["gpu_arch_type"] == "cuda" and config["gpu_arch_version"] in ["12.8", "12.9"] %} + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + {%- elif config["gpu_arch_type"] == "cuda" %} + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {%- else %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.4xlarge @@ -135,7 +154,11 @@ jobs: uses: ./.github/actions/setup-xpu - name: configure aws credentials id: aws_creds +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v1.7.0 +======= + uses: aws-actions/configure-aws-credentials@v4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: 
arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only aws-region: us-east-1 @@ -148,6 +171,7 @@ jobs: name: !{{ config["build_name"] }} path: "${{ runner.temp }}/artifacts/" !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} +<<<<<<< HEAD - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: @@ -158,6 +182,29 @@ jobs: uses: ./.github/actions/teardown-xpu {%- else %} runs-on: linux.rocm.gpu +======= + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: !{{ config["container_image"] }} + custom-tag-prefix: !{{ config["container_image_tag_prefix"] }} + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown XPU + uses: ./.github/actions/teardown-xpu + {%- else %} + runs-on: linux.rocm.gpu.mi250 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: !{{ common.timeout_minutes }} !{{ upload.binary_env(config) }} steps: @@ -172,12 +219,40 @@ jobs: - name: ROCm set GPU_FLAG run: | echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" +<<<<<<< HEAD - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: !{{ config["container_image"] }} - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary +======= + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: !{{ config["container_image"] }} + custom-tag-prefix: !{{ config["container_image_tag_prefix"] }} + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Teardown ROCm uses: ./.github/actions/teardown-rocm {%- endif %} diff --git a/.github/templates/upload.yml.j2 b/.github/templates/upload.yml.j2 index 9190ef7deb88..0b3fcb8882cf 100644 --- a/.github/templates/upload.yml.j2 
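The runner selection that the incoming branch encodes in the linux binary build template above amounts to: ROCm jobs stay on the ROCm fleet, CUDA 12.8 and 12.9 builds need an sm_70+ (g4dn) runner, other CUDA builds use the plain GPU runner, and everything else falls back to linux.4xlarge. A condensed Python restatement of those Jinja conditionals (illustrative only, omitting the s390x and runner-prefix details):

def pick_runs_on(gpu_arch_type: str, gpu_arch_version: str) -> str:
    if gpu_arch_type == "rocm":
        return "linux.rocm.gpu"
    if gpu_arch_type == "cuda" and gpu_arch_version in ("12.8", "12.9"):
        return "linux.g4dn.4xlarge.nvidia.gpu"  # 12.8 and 12.9 builds need an sm_70+ runner
    if gpu_arch_type == "cuda":
        return "linux.4xlarge.nvidia.gpu"
    return "linux.4xlarge"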
+++ b/.github/templates/upload.yml.j2 @@ -23,11 +23,17 @@ {%- endif %} {%- if not is_windows %} DOCKER_IMAGE: !{{ config["container_image"] }} +<<<<<<< HEAD {%- endif %} {%- if config["package_type"] == "manywheel" %} {%- if config["devtoolset"] %} DESIRED_DEVTOOLSET: !{{ config["devtoolset"] }} {%- endif %} +======= + DOCKER_IMAGE_TAG_PREFIX: !{{ config["container_image_tag_prefix"] }} +{%- endif %} +{%- if config["package_type"] == "manywheel" %} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {%- if config.use_split_build is defined %} use_split_build: !{{ config["use_split_build"] }} {%- endif %} @@ -37,9 +43,12 @@ LIBTORCH_CONFIG: !{{ config["libtorch_config"] }} {%- endif %} LIBTORCH_VARIANT: !{{ config["libtorch_variant"] }} +<<<<<<< HEAD {%- if config["devtoolset"] %} DESIRED_DEVTOOLSET: !{{ config["devtoolset"] }} {%- endif %} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {%- if is_windows %} # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason diff --git a/.github/templates/windows_binary_build_workflow.yml.j2 b/.github/templates/windows_binary_build_workflow.yml.j2 index 5bb241b66db9..a47922772eaa 100644 --- a/.github/templates/windows_binary_build_workflow.yml.j2 +++ b/.github/templates/windows_binary_build_workflow.yml.j2 @@ -49,13 +49,29 @@ env: PR_NUMBER: ${{ github.event.pull_request.number }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} SKIP_ALL_TESTS: 1 +<<<<<<< HEAD +======= + OS: !{{ os }} +{%- if os == "windows-arm64" %} + PYTORCH_ROOT: /pytorch + DOWNLOADS_DIR: c:\temp\downloads + DEPENDENCIES_DIR: c:\temp\dependencies + ENABLE_APL: 1 + ENABLE_OPENBLAS: 0 + MSVC_VERSION : 14.42 +{%- endif %} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) !{{ common.concurrency(build_environment) }} jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -66,20 +82,80 @@ jobs: !{{ config["build_name"] }}-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD +======= + {%- if os == "windows-arm64" %} + runs-on: "windows-11-arm64-preview" + {%- else %} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {%- if branches == "nightly" %} runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" {%- else %} runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" {%- endif %} +<<<<<<< HEAD +======= + {%- endif %} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: !{{ common.timeout_minutes_windows_binary }} !{{ upload.binary_env(config, True) }} {%- if config.pytorch_extra_install_requirements is defined and 
config.pytorch_extra_install_requirements|d('')|length > 0 %} PYTORCH_EXTRA_INSTALL_REQUIREMENTS: !{{ config.pytorch_extra_install_requirements }} {%- endif %} steps: +<<<<<<< HEAD !{{ common.setup_ec2_windows() }} !{{ set_runner_specific_vars() }} !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} +======= +{%- if os == "windows-arm64" %} + - name: Populate binary env + shell: cmd + run: | + echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV% + echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV% + echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV% + - name: Bootstrap folders + shell: cmd + run: | + mkdir "%NIGHTLIES_PYTORCH_ROOT%" + mkdir "%PYTORCH_FINAL_PACKAGE_DIR%" + - name: Enable long paths + shell: cmd + run: | + git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now" + git config --system core.longpaths true + - name: Git checkout PyTorch + uses: actions/checkout@v4 + with: + path: "pytorch" + submodules: recursive + - name: Bootstrap Python + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat" + - name: Bootstrap APL + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat" + - name: Bootstrap Rust + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat" + - name: Bootstrap sccache + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_sccache.bat" + - name: Bootstrap Libuv + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_libuv.bat" +{%- else %} + !{{ set_runner_specific_vars() }} + !{{ common.setup_ec2_windows() }} + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} +{%- endif %} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -95,12 +171,24 @@ jobs: retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD + !{{ common.wait_and_kill_ssh_windows('pytorch') }} +======= +{%- if os != "windows-arm64" %} !{{ common.wait_and_kill_ssh_windows('pytorch') }} +{% endif %} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) !{{ config["build_name"] }}-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - !{{ config["build_name"] }}-build - get-label-type +<<<<<<< HEAD +======= +{%- if os == "windows-arm64" %} + runs-on: "windows-11-arm64-preview" +{%- else %} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {%- if config["gpu_arch_type"] == "cuda" %} {%- if branches == "nightly" %} runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" @@ -113,18 +201,62 @@ jobs: {%- else %} runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" {%- endif %} +{%- endif %} +<<<<<<< HEAD + timeout-minutes: !{{ common.timeout_minutes_windows_binary }} + !{{ upload.binary_env(config, True) }} + steps: + !{{ common.setup_ec2_windows() }} + !{{ set_runner_specific_vars() }} +======= {%- endif %} timeout-minutes: !{{ common.timeout_minutes_windows_binary }} !{{ upload.binary_env(config, True) }} steps: +{%- if os == "windows-arm64" %} + - name: Populate binary env + shell: cmd + run: | + echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV% + echo 
PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV% + echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV% + - name: Enable long paths + shell: cmd + run: | + git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now" + git config --system core.longpaths true + - name: Git checkout PyTorch + uses: actions/checkout@v4 + with: + path: "pytorch" + submodules: recursive + - name: Bootstrap APL + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat" + - name: Bootstrap Python + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat" + - name: Bootstrap Rust + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat" +{%- else %} !{{ common.setup_ec2_windows() }} + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} !{{ set_runner_specific_vars() }} +{%- endif %} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - uses: !{{ common.download_artifact_action }} name: Download Build Artifacts with: name: !{{ config["build_name"] }} path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -133,8 +265,18 @@ jobs: shell: bash run: | "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" +<<<<<<< HEAD + !{{ common.wait_and_kill_ssh_windows('pytorch') }} + {%- if branches == "nightly" %} + !{{ upload.upload_binaries(config, True) }} + {%- endif %} +{%- endfor %} +======= +{%- if os != "windows-arm64" %} !{{ common.wait_and_kill_ssh_windows('pytorch') }} +{%- endif %} {%- if branches == "nightly" %} !{{ upload.upload_binaries(config, True) }} {%- endif %} {%- endfor %} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/_bazel-build-test.yml b/.github/workflows/_bazel-build-test.yml index 0f7ed87f2a4c..739c82447f93 100644 --- a/.github/workflows/_bazel-build-test.yml +++ b/.github/workflows/_bazel-build-test.yml @@ -47,7 +47,11 @@ jobs: reenabled-issues: ${{ steps.filter.outputs.reenabled-issues }} steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 1 submodules: false @@ -69,25 +73,41 @@ jobs: runs-on: ${{ matrix.runner }} steps: - name: Setup SSH (Click me for login details) +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: github-secret: ${{ secrets.GITHUB_TOKEN }} # [see note: pytorch repo ref] - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) - name: Setup Linux uses: ./.github/actions/setup-linux - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image-name: ${{ inputs.docker-image-name }} - name: Pull docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -97,7 +117,11 @@ jobs: run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT" - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ inputs.cuda-version != 'cpu' && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }} - name: Output disk space left @@ -209,5 +233,9 @@ jobs: file-suffix: bazel-${{ github.job }}_${{ steps.get-job-id.outputs.job-id }} - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() diff --git a/.github/workflows/_binary-build-linux.yml b/.github/workflows/_binary-build-linux.yml index eab7c43800bc..c9e930c99348 100644 --- a/.github/workflows/_binary-build-linux.yml +++ b/.github/workflows/_binary-build-linux.yml @@ -23,7 +23,11 @@ on: description: Hardware to run this "build" job on, linux.12xlarge or linux.arm64.2xlarge. 
timeout-minutes: required: false +<<<<<<< HEAD default: 210 +======= + default: 240 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) type: number description: timeout for the job use_split_build: @@ -62,6 +66,13 @@ on: required: true type: string description: Docker image to use +<<<<<<< HEAD +======= + DOCKER_IMAGE_TAG_PREFIX: + required: true + type: string + description: Docker image tag to use +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) LIBTORCH_CONFIG: required: false type: string @@ -70,10 +81,13 @@ on: required: false type: string description: Desired libtorch variant (for libtorch builds only) +<<<<<<< HEAD DESIRED_DEVTOOLSET: required: false type: string description: Desired dev toolset +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: required: false type: string @@ -88,6 +102,12 @@ on: required: true description: Github Token +<<<<<<< HEAD +======= +permissions: + id-token: write + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: build: runs-on: ${{ inputs.runner_prefix}}${{ inputs.runs_on }} @@ -104,7 +124,10 @@ jobs: SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: ${{ inputs.LIBTORCH_CONFIG }} LIBTORCH_VARIANT: ${{ inputs.LIBTORCH_VARIANT }} +<<<<<<< HEAD DESIRED_DEVTOOLSET: ${{ inputs.DESIRED_DEVTOOLSET }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: ${{ inputs.DESIRED_PYTHON }} PYTORCH_EXTRA_INSTALL_REQUIREMENTS: ${{ inputs.PYTORCH_EXTRA_INSTALL_REQUIREMENTS }} ALPINE_IMAGE: ${{ inputs.ALPINE_IMAGE }} @@ -130,7 +153,10 @@ jobs: echo "SKIP_ALL_TESTS=${{ env.SKIP_ALL_TESTS }}" echo "LIBTORCH_CONFIG=${{ env.LIBTORCH_CONFIG }}" echo "LIBTORCH_VARIANT=${{ env.LIBTORCH_VARIANT }}" +<<<<<<< HEAD echo "DESIRED_DEVTOOLSET=${{ env.DESIRED_DEVTOOLSET }}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "DESIRED_PYTHON=${{ env.DESIRED_PYTHON }}" echo "PYTORCH_EXTRA_INSTALL_REQUIREMENTS=${{ env.PYTORCH_EXTRA_INSTALL_REQUIREMENTS }}" echo "ALPINE_IMAGE=${{ env.ALPINE_IMAGE }}" @@ -150,13 +176,21 @@ jobs: - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" if: inputs.build_environment != 'linux-s390x-binary-manywheel' +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.github-token }} - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: ${{ inputs.build_environment == 'linux-aarch64-binary-manywheel' || inputs.build_environment == 'linux-s390x-binary-manywheel' }} @@ -184,7 +218,11 @@ jobs: fi - name: Checkout PyTorch to pytorch dir +<<<<<<< HEAD uses: actions/checkout@v4 +======= + uses: 
actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: recursive path: pytorch @@ -208,6 +246,7 @@ jobs: { config: "default" }, ]} +<<<<<<< HEAD - name: Pull Docker image if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 @@ -216,6 +255,42 @@ jobs: - name: Build PyTorch binary if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' }} +======= + - name: configure aws credentials + id: aws_creds + if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' && startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + + - name: Calculate docker image + id: calculate-docker-image + if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + # If doing this in release/2.8 or release branch, use docker.io. Otherwise + # use ECR + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: ${{ inputs.DOCKER_IMAGE }} + custom-tag-prefix: ${{ inputs.DOCKER_IMAGE_TAG_PREFIX }} + # The build.sh script in this folder is not actually the correct one, + # this is just needed for sha calculation + docker-build-dir: .ci/docker + working-directory: pytorch + + - name: Pull Docker image + if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + + - name: Build PyTorch binary + if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' }} + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image || format('{0}:{1}', inputs.DOCKER_IMAGE, inputs.DOCKER_IMAGE_TAG_PREFIX) }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) run: | set -x mkdir -p artifacts/ @@ -223,7 +298,10 @@ jobs: -e BINARY_ENV_FILE \ -e BUILD_ENVIRONMENT \ -e DESIRED_CUDA \ +<<<<<<< HEAD -e DESIRED_DEVTOOLSET \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) -e DESIRED_PYTHON \ -e GITHUB_ACTIONS \ -e GPU_ARCH_TYPE \ @@ -256,7 +334,11 @@ jobs: # Ensure the working directory gets chowned back to the current user docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
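The release/2.8 side of the hunk above resolves the image to build against in two steps: `calculate-docker-image` picks a registry based on the triggering ref, and the `DOCKER_IMAGE` environment variable falls back to `format('{0}:{1}', inputs.DOCKER_IMAGE, inputs.DOCKER_IMAGE_TAG_PREFIX)` when that step is skipped. A minimal shell sketch of the same selection logic follows; the registry hostnames and the `ciflow` ref prefix are taken from the hunk, while the ref, image name, and tag prefix values are hypothetical placeholders.

```bash
#!/usr/bin/env bash
# Sketch only -- mirrors the workflow expressions above, not part of any CI file.
GITHUB_REF="refs/tags/ciflow/binaries/12345"   # placeholder for github.event.ref
DOCKER_IMAGE="pytorch/manylinux2_28-builder"   # placeholder for inputs.DOCKER_IMAGE
DOCKER_IMAGE_TAG_PREFIX="rocm6.4"              # placeholder for inputs.DOCKER_IMAGE_TAG_PREFIX
CALCULATED_IMAGE=""                            # empty when calculate-docker-image was skipped

# startsWith(github.event.ref, 'refs/tags/ciflow/') && <ECR> || 'docker.io'
if [[ "${GITHUB_REF}" == refs/tags/ciflow/* ]]; then
  REGISTRY="308535385114.dkr.ecr.us-east-1.amazonaws.com"
else
  REGISTRY="docker.io"
fi

# steps.calculate-docker-image.outputs.docker-image || format('{0}:{1}', image, tag-prefix)
DOCKER_IMAGE_REF="${CALCULATED_IMAGE:-${DOCKER_IMAGE}:${DOCKER_IMAGE_TAG_PREFIX}}"

echo "registry: ${REGISTRY}"
echo "image:    ${DOCKER_IMAGE_REF}"
```

As in the workflow expression, an empty output from the calculate step falls through to the composed `image:tag-prefix` reference, so the build can still run when the ECR lookup is skipped.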
+<<<<<<< HEAD - uses: actions/upload-artifact@v4.4.0 +======= + - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' }} with: name: ${{ inputs.build_name }} @@ -266,7 +348,11 @@ jobs: - name: Teardown Linux if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Chown workspace if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' diff --git a/.github/workflows/_binary-test-linux.yml b/.github/workflows/_binary-test-linux.yml index 153f1e6d2f1a..23f7fa91a51c 100644 --- a/.github/workflows/_binary-test-linux.yml +++ b/.github/workflows/_binary-test-linux.yml @@ -39,6 +39,13 @@ on: required: true type: string description: Docker image to use +<<<<<<< HEAD +======= + DOCKER_IMAGE_TAG_PREFIX: + required: true + type: string + description: Docker image tag to use +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) LIBTORCH_CONFIG: required: false type: string @@ -47,10 +54,13 @@ on: required: false type: string description: Desired libtorch variant (for libtorch builds only) +<<<<<<< HEAD DESIRED_DEVTOOLSET: required: false type: string description: Desired dev toolset +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: required: false type: string @@ -76,6 +86,12 @@ on: required: true description: Github Token +<<<<<<< HEAD +======= +permissions: + id-token: write + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: test: runs-on: ${{ inputs.runner_prefix}}${{ inputs.runs_on }} @@ -92,7 +108,10 @@ jobs: SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: ${{ inputs.LIBTORCH_CONFIG }} LIBTORCH_VARIANT: ${{ inputs.LIBTORCH_VARIANT }} +<<<<<<< HEAD DESIRED_DEVTOOLSET: ${{ inputs.DESIRED_DEVTOOLSET }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: ${{ inputs.DESIRED_PYTHON }} ALPINE_IMAGE: ${{ inputs.ALPINE_IMAGE }} AWS_DEFAULT_REGION: us-east-1 @@ -118,7 +137,10 @@ jobs: echo "SKIP_ALL_TESTS=${{ env.SKIP_ALL_TESTS }}" echo "LIBTORCH_CONFIG=${{ env.LIBTORCH_CONFIG }}" echo "LIBTORCH_VARIANT=${{ env.LIBTORCH_VARIANT }}" +<<<<<<< HEAD echo "DESIRED_DEVTOOLSET=${{ env.DESIRED_DEVTOOLSET }}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "DESIRED_PYTHON=${{ env.DESIRED_PYTHON }}" echo "ALPINE_IMAGE=${{ env.ALPINE_IMAGE }}" @@ -133,14 +155,22 @@ jobs: - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" if: inputs.build_environment != 'linux-s390x-binary-manywheel' +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.github-token }} # Setup the environment - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: ${{ inputs.build_environment == 'linux-aarch64-binary-manywheel' || inputs.build_environment == 'linux-s390x-binary-manywheel' }} @@ -161,7 +191,11 @@ jobs: mkdir "${GITHUB_WORKSPACE}" - name: Checkout PyTorch to pytorch dir +<<<<<<< HEAD uses: actions/checkout@v4 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: recursive show-progress: false @@ -187,12 +221,17 @@ jobs: - name: Download Build Artifacts if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' }} +<<<<<<< HEAD uses: actions/download-artifact@v4.1.7 +======= + uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: name: ${{ inputs.build_name }} path: "${{ runner.temp }}/artifacts/" - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.7 if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }} @@ -201,14 +240,53 @@ jobs: uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: ${{ inputs.DOCKER_IMAGE }} +======= + uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.8 + if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }} + + - name: configure aws credentials + id: aws_creds + if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' && startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + + - name: Calculate docker image + id: calculate-docker-image + if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: ${{ inputs.DOCKER_IMAGE }} + custom-tag-prefix: ${{ inputs.DOCKER_IMAGE_TAG_PREFIX }} + docker-build-dir: .ci/docker + working-directory: pytorch + + - name: Pull Docker image + if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Test Pytorch binary if: 
${{ steps.filter.outputs.is-test-matrix-empty == 'False' }} uses: ./pytorch/.github/actions/test-pytorch-binary +<<<<<<< HEAD - name: Teardown Linux if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7 +======= + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image || format('{0}:{1}', inputs.DOCKER_IMAGE, inputs.DOCKER_IMAGE_TAG_PREFIX) }} + + - name: Teardown Linux + if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Chown workspace if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' diff --git a/.github/workflows/_binary-upload.yml b/.github/workflows/_binary-upload.yml index 296ac999c8c2..828441619ab9 100644 --- a/.github/workflows/_binary-upload.yml +++ b/.github/workflows/_binary-upload.yml @@ -35,6 +35,13 @@ on: required: false type: string description: Docker image to use +<<<<<<< HEAD +======= + DOCKER_IMAGE_TAG_PREFIX: + required: false + type: string + description: Docker image tag to use +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) LIBTORCH_CONFIG: required: false type: string @@ -43,10 +50,13 @@ on: required: false type: string description: Desired libtorch variant (for libtorch builds only) +<<<<<<< HEAD DESIRED_DEVTOOLSET: required: false type: string description: Desired dev toolset +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: required: false type: string @@ -80,7 +90,10 @@ jobs: SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: ${{ inputs.LIBTORCH_CONFIG }} LIBTORCH_VARIANT: ${{ inputs.LIBTORCH_VARIANT }} +<<<<<<< HEAD DESIRED_DEVTOOLSET: ${{ inputs.DESIRED_DEVTOOLSET }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: ${{ inputs.DESIRED_PYTHON }} BINARY_ENV_FILE: /tmp/env GITHUB_TOKEN: ${{ secrets.github-token }} @@ -90,20 +103,32 @@ jobs: USE_SPLIT_BUILD: ${{ inputs.use_split_build }} steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: true - name: Configure AWS credentials(PyTorch account) for nightly if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/nightly' }} +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v3 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_nightly_build_wheels aws-region: us-east-1 - name: Configure AWS credentials(PyTorch account) for RC builds if: ${{ github.event_name == 'push' && (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/')) }} +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v3 +======= + uses: 
aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_test_build_wheels aws-region: us-east-1 @@ -113,7 +138,11 @@ jobs: # NB: When the previous build job is skipped, there won't be any artifacts and # this step will fail. Binary build jobs can only be skipped on CI, not nightly continue-on-error: true +<<<<<<< HEAD uses: actions/download-artifact@v4.1.7 +======= + uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: name: ${{ inputs.build_name }} path: "${{ runner.temp }}/artifacts/" diff --git a/.github/workflows/_docs.yml b/.github/workflows/_docs.yml index cf1788a2d78a..560cb6a166b9 100644 --- a/.github/workflows/_docs.yml +++ b/.github/workflows/_docs.yml @@ -84,7 +84,11 @@ jobs: name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }} steps: - name: Setup SSH (Click me for login details) +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: github-secret: ${{ secrets.GITHUB_TOKEN }} instructions: | @@ -95,14 +99,22 @@ jobs: # [see note: pytorch repo ref] - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Setup Linux uses: ./.github/actions/setup-linux - name: configure aws credentials if : ${{ inputs.aws-role-to-assume != '' }} +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v3 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: ${{ inputs.aws-role-to-assume }} role-session-name: gha-linux-test @@ -110,12 +122,20 @@ jobs: - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image-name: ${{ inputs.docker-image }} - name: Pull docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -185,14 +205,22 @@ jobs: - name: configure aws credentials if : ${{ inputs.upload-aws-role-to-assume != '' }} +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v3 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 
5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: ${{ inputs.upload-aws-role-to-assume }} role-session-name: gha-linux-test aws-region: us-east-1 - name: Upload Python Docs Preview +<<<<<<< HEAD uses: seemethere/upload-artifact-s3@v5 +======= + uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'python' && steps.build-docs.outcome == 'success' }} with: retention-days: 14 @@ -202,7 +230,11 @@ jobs: s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }} - name: Upload C++ Docs Preview +<<<<<<< HEAD uses: seemethere/upload-artifact-s3@v5 +======= + uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'cpp' && steps.build-docs.outcome == 'success' }} with: retention-days: 14 @@ -212,7 +244,11 @@ jobs: s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/cppdocs - name: Upload functorch Docs Preview +<<<<<<< HEAD uses: seemethere/upload-artifact-s3@v5 +======= + uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'functorch' && steps.build-docs.outcome == 'success' }} with: retention-days: 14 @@ -222,5 +258,9 @@ jobs: s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/functorchdocs - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() diff --git a/.github/workflows/_link_check.yml b/.github/workflows/_link_check.yml new file mode 100644 index 000000000000..bfe554decf53 --- /dev/null +++ b/.github/workflows/_link_check.yml @@ -0,0 +1,60 @@ +on: + workflow_call: + inputs: + runner: + type: string + required: true + ref: + type: string + required: true + +jobs: + lint-urls: + if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-url-lint') }} + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 + with: + timeout: 120 + runner: ${{ inputs.runner }}linux.2xlarge + docker-image: ci-image:pytorch-linux-jammy-linter + fetch-depth: 0 + submodules: false + ref: ${{ inputs.ref }} + script: | + ./scripts/lint_urls.sh $( + if [ "${{ github.event_name }}" = "pull_request" ]; then + echo "${{ github.event.pull_request.base.sha }}" "${{ github.event.pull_request.head.sha }}" + else + echo "${{ github.event.before }}" "${{ github.sha }}" + fi + ) || { + echo + echo "URL lint failed." + echo "If this is a transient outage, you can bypass it by adding the \`skip-url-lint\` label to your PR." + echo "Or add \`@lint-ignore\` somewhere on the same line as the URL you want to skip checking." 
+ exit 1 + } + + lint-xrefs: + if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-xref-lint') }} + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 + with: + timeout: 60 + runner: ${{ inputs.runner }}linux.2xlarge + docker-image: ci-image:pytorch-linux-jammy-linter + fetch-depth: 0 + submodules: false + ref: ${{ inputs.ref }} + script: | + ./scripts/lint_xrefs.sh $( + if [ "${{ github.event_name }}" = "pull_request" ]; then + echo "${{ github.event.pull_request.base.sha }}" "${{ github.event.pull_request.head.sha }}" + else + echo "${{ github.event.before }}" "${{ github.sha }}" + fi + ) || { + echo + echo "Xref lint failed." + echo "If this is a transient outage, you can bypass it by adding the \`skip-xref-lint\` label to your PR." + echo "Or add \`@lint-ignore\` somewhere on the same line as the reference you want to skip checking." + exit 1 + } diff --git a/.github/workflows/_linux-build.yml b/.github/workflows/_linux-build.yml index 7426b62428a9..0df563f49682 100644 --- a/.github/workflows/_linux-build.yml +++ b/.github/workflows/_linux-build.yml @@ -74,6 +74,35 @@ on: Overwrite the number of jobs to use for the build required: false type: string +<<<<<<< HEAD +======= + disable-monitor: + description: | + Disable utilization monitoring for build job + required: false + type: boolean + default: false + monitor-log-interval: + description: | + Set the interval for the monitor script to log utilization. + required: false + type: number + default: 5 + monitor-data-collect-interval: + description: | + Set the interval for the monitor script to collect data. + required: false + type: number + default: 1 + + allow-reuse-old-whl: + description: | + If set, the build try to pull an old wheel from s3 that was built on a + commit with no cpp changes from this commit + required: false + type: boolean + default: true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: HUGGING_FACE_HUB_TOKEN: @@ -106,7 +135,11 @@ jobs: test-matrix: ${{ steps.filter.outputs.test-matrix }} steps: - name: Setup SSH (Click me for login details) +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -116,7 +149,11 @@ jobs: # checkout because when we run this action we don't *have* a local # checkout. In other cases you should prefer a local checkout. 
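Stepping back to the new `_link_check.yml` workflow added above: both jobs drive `scripts/lint_urls.sh` / `scripts/lint_xrefs.sh` with a commit range that depends on the trigger, and turn a non-zero exit into a hint about the bypass label. A standalone sketch of that invocation pattern is below; only the script path, label name, and `@lint-ignore` convention come from the hunk, and the event name and SHAs are placeholders.

```bash
#!/usr/bin/env bash
# Sketch of the lint-urls invocation pattern; event name and SHAs are placeholders.
EVENT_NAME="pull_request"                     # stands in for github.event_name
BASE_SHA="1111111"; HEAD_SHA="2222222"        # PR base/head SHAs
BEFORE_SHA="3333333"; CURRENT_SHA="4444444"   # push before/after SHAs

if [ "${EVENT_NAME}" = "pull_request" ]; then
  range=("${BASE_SHA}" "${HEAD_SHA}")
else
  range=("${BEFORE_SHA}" "${CURRENT_SHA}")
fi

./scripts/lint_urls.sh "${range[@]}" || {
  echo
  echo "URL lint failed."
  echo "If this is a transient outage, add the skip-url-lint label to the PR,"
  echo "or put @lint-ignore on the same line as the URL to skip checking it."
  exit 1
}
```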
- name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: true @@ -125,13 +162,18 @@ jobs: if: inputs.build-environment != 'linux-s390x-binary-manywheel' - name: configure aws credentials +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v3 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }} with: role-to-assume: ${{ inputs.aws-role-to-assume }} role-session-name: gha-linux-build aws-region: us-east-1 +<<<<<<< HEAD - name: Calculate docker image id: calculate-docker-image uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7 @@ -159,6 +201,8 @@ jobs: id: parse-ref run: .github/scripts/parse_ref.py +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Get workflow job id id: get-job-id uses: ./.github/actions/get-workflow-job-id @@ -166,6 +210,47 @@ jobs: with: github-token: ${{ secrets.GITHUB_TOKEN }} +<<<<<<< HEAD +======= + - name: Check if can use old whl build + id: use-old-whl + uses: ./.github/actions/reuse-old-whl + if: ${{ inputs.allow-reuse-old-whl }} + with: + build-environment: ${{ inputs.build-environment }} + run-id: ${{ github.run_id }} + github-token: ${{ secrets.GITHUB_TOKEN }} + job-id: ${{ steps.get-job-id.outputs.job-id }} + job-name: ${{ steps.get-job-id.outputs.job-name }} + + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + if: inputs.build-environment != 'linux-s390x-binary-manywheel' + with: + docker-image-name: ${{ inputs.docker-image-name }} + + - name: Use following to pull public copy of the image + id: print-ghcr-mirror + if: inputs.build-environment != 'linux-s390x-binary-manywheel' && steps.use-old-whl.outputs.reuse != 'true' + env: + ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + shell: bash + run: | + tag=${ECR_DOCKER_IMAGE##*:} + echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + if: inputs.build-environment != 'linux-s390x-binary-manywheel' && steps.use-old-whl.outputs.reuse != 'true' + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Apply the filter logic to the build step too if the test-config label is already there - name: Select all requested test configurations (if the test matrix is available) id: filter @@ -176,17 +261,49 @@ jobs: selected-test-configs: ${{ inputs.selected-test-configs }} job-name: ${{ steps.get-job-id.outputs.job-name }} +<<<<<<< HEAD - name: Download pytest cache uses: ./.github/actions/pytest-cache-download continue-on-error: true if: inputs.build-environment != 'linux-s390x-binary-manywheel' +======= + - name: 
Start monitoring script + id: monitor-script + if: ${{ !inputs.disable-monitor }} + shell: bash + continue-on-error: true + env: + JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} + WORKFLOW_NAME: ${{ github.workflow }} + WORKFLOW_RUN_ID: ${{github.run_id}} + MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }} + MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} + run: | + mkdir -p ../../usage_logs + python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 + python3 -m tools.stats.monitor \ + --log-interval "$MONITOR_LOG_INTERVAL" \ + --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" \ + > "../../usage_logs/usage_log_build_${JOB_ID}.txt" 2>&1 & + echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" + + - name: Download pytest cache + uses: ./.github/actions/pytest-cache-download + continue-on-error: true + if: inputs.build-environment != 'linux-s390x-binary-manywheel' && steps.use-old-whl.outputs.reuse != 'true' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: cache_dir: .pytest_cache job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }} s3_bucket: ${{ inputs.s3-bucket }} - name: Build +<<<<<<< HEAD if: steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == '' +======= + if: (steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == '') && steps.use-old-whl.outputs.reuse != 'true' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) id: build env: BUILD_ENVIRONMENT: ${{ inputs.build-environment }} @@ -280,14 +397,33 @@ jobs: END_TIME=$(date +%s) echo "build_time=$((END_TIME - START_TIME))" >> "$GITHUB_OUTPUT" +<<<<<<< HEAD - name: Archive artifacts into zip if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' +======= + - name: Stop monitoring script + if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }} + shell: bash + continue-on-error: true + env: + MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }} + run: | + kill "$MONITOR_SCRIPT_PID" + + - name: Archive artifacts into zip + if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && steps.use-old-whl.outputs.reuse != 'true' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) run: | zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .additional_ci_files - name: Store PyTorch Build Artifacts on S3 +<<<<<<< HEAD uses: seemethere/upload-artifact-s3@v5 if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel' +======= + uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0 + if: inputs.build-generates-artifacts && (steps.build.outcome != 'skipped' || steps.use-old-whl.outputs.reuse == 'true') && inputs.build-environment != 'linux-s390x-binary-manywheel' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: name: ${{ inputs.build-environment }} retention-days: 14 @@ -296,14 +432,41 @@ jobs: s3-bucket: ${{ inputs.s3-bucket }} - name: Store PyTorch Build Artifacts for s390x +<<<<<<< HEAD uses: actions/upload-artifact@v4 if: inputs.build-generates-artifacts && 
steps.build.outcome != 'skipped' && inputs.build-environment == 'linux-s390x-binary-manywheel' +======= + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + if: inputs.build-generates-artifacts && (steps.build.outcome != 'skipped' || steps.use-old-whl.outputs.reuse == 'true') && inputs.build-environment == 'linux-s390x-binary-manywheel' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: name: ${{ inputs.build-environment }} retention-days: 14 if-no-files-found: error path: artifacts.zip +<<<<<<< HEAD +======= + - name: copy logs + shell: bash + if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel'}} + continue-on-error: true + run: | + rm -f ./usage_logs + mkdir -p ./usage_logs + cp ../../usage_logs/usage_log_build_*.txt ./usage_logs/ + + - name: Upload raw usage log to s3 + if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel'}} + uses: seemethere/upload-artifact-s3@v5 + with: + s3-prefix: | + ${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact + retention-days: 14 + if-no-files-found: warn + path: usage_logs/usage_log_build_*.txt + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Upload sccache stats if: steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel' uses: ./.github/actions/upload-sccache-stats @@ -311,8 +474,25 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} build-time: ${{ steps.build.outputs.build_time }} +<<<<<<< HEAD - name: Teardown Linux uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7 +======= + - name: Upload utilization stats + if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel' }} + continue-on-error: true + uses: ./.github/actions/upload-utilization-stats + with: + job_id: ${{ steps.get-job-id.outputs.job-id }} + job_name: ${{ steps.get-job-id.outputs.job-name }} + workflow_name: ${{ github.workflow }} + workflow_run_id: ${{github.run_id}} + workflow_attempt: ${{github.run_attempt}} + artifact_prefix: usage_log_build_${{ steps.get-job-id.outputs.job-id }} + + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel' - name: Cleanup docker diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml index 389a65a782c8..44d09f39d94d 100644 --- a/.github/workflows/_linux-test.yml +++ b/.github/workflows/_linux-test.yml @@ -55,6 +55,21 @@ on: required: false type: boolean default: false +<<<<<<< HEAD +======= + monitor-log-interval: + description: | + Set the interval for the monitor script to log utilization. + required: false + type: number + default: 5 + monitor-data-collect-interval: + description: | + Set the interval for the monitor script to collect data. 
+ required: false + type: number + default: 1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: HUGGING_FACE_HUB_TOKEN: required: false @@ -80,7 +95,11 @@ jobs: timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }} steps: - name: Setup SSH (Click me for login details) +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ !contains(matrix.runner, 'gcp.a100') && inputs.build-environment != 'linux-s390x-binary-manywheel' }} with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -89,7 +108,11 @@ jobs: docker exec -it $(docker container ps --format '{{.ID}}') bash - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: true @@ -99,7 +122,11 @@ jobs: - name: configure aws credentials if : ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }} +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v3 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: ${{ inputs.aws-role-to-assume }} role-session-name: gha-linux-test @@ -107,7 +134,11 @@ jobs: - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: docker-image-name: ${{ inputs.docker-image }} @@ -119,11 +150,19 @@ jobs: ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} shell: bash run: | +<<<<<<< HEAD tag=${ECR_DOCKER_IMAGE##*/} echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" - name: Pull docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 +======= + tag=${ECR_DOCKER_IMAGE##*:} + echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -135,7 +174,11 @@ jobs: - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG id: install-nvidia-driver +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ contains(inputs.build-environment, 'cuda') && 
!contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }} - name: Setup GPU_FLAG for docker run @@ -172,9 +215,17 @@ jobs: JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} WORKFLOW_NAME: ${{ github.workflow }} WORKFLOW_RUN_ID: ${{github.run_id}} +<<<<<<< HEAD run: | python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84 dataclasses_json==0.6.7 python3 -m tools.stats.monitor > usage_log.txt 2>&1 & +======= + MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }} + MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} + run: | + python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84 + python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" - name: Download build artifacts @@ -193,7 +244,11 @@ jobs: run: .github/scripts/parse_ref.py - name: Check for keep-going label and re-enabled test issues +<<<<<<< HEAD # This uses the filter-test-configs action because it conviniently +======= + # This uses the filter-test-configs action because it conveniently +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # checks for labels and re-enabled test issues. It does not actually do # any filtering. All filtering is done in the build step. id: keep-going @@ -362,7 +417,11 @@ jobs: - name: Upload pytest cache if tests failed uses: ./.github/actions/pytest-cache-upload continue-on-error: true +<<<<<<< HEAD if: failure() && steps.test.conclusion && steps.test.conclusion == 'failure' +======= + if: failure() && steps.test.conclusion && steps.test.conclusion == 'failure' && inputs.build-environment != 'linux-s390x-binary-manywheel' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: cache_dir: .pytest_cache shard: ${{ matrix.shard }} @@ -371,7 +430,12 @@ jobs: job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }} - name: Upload the benchmark results +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.8 + if: inputs.build-environment != 'linux-s390x-binary-manywheel' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: benchmark-results-dir: test/test-reports dry-run: false @@ -408,7 +472,11 @@ jobs: find . 
-iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \; - name: Store Core dumps on S3 +<<<<<<< HEAD uses: seemethere/upload-artifact-s3@v5 +======= + uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: failure() with: name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }} @@ -417,7 +485,11 @@ jobs: path: ./**/core.[1-9]* - name: Upload utilization stats +<<<<<<< HEAD if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' && !inputs.disable-monitor }} +======= + if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel' }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true uses: ./.github/actions/upload-utilization-stats with: @@ -428,7 +500,11 @@ jobs: workflow_attempt: ${{github.run_attempt}} - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' # NB: We are currently having an intermittent GPU-related issue on G5 runners with @@ -445,8 +521,11 @@ jobs: - name: Check NVIDIA driver installation step if: failure() && steps.install-nvidia-driver.outcome && steps.install-nvidia-driver.outcome != 'skipped' shell: bash +<<<<<<< HEAD env: RUNNER_WORKSPACE: ${{ runner.workspace }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) run: | set +e set -x diff --git a/.github/workflows/_mac-build.yml b/.github/workflows/_mac-build.yml index 0c0d42d398a6..f42e768b62d7 100644 --- a/.github/workflows/_mac-build.yml +++ b/.github/workflows/_mac-build.yml @@ -30,6 +30,7 @@ on: python-version: required: false type: string +<<<<<<< HEAD default: "3.9" description: | The python version to be used. Will be 3.9 by default @@ -37,6 +38,11 @@ on: required: false type: string description: Set the conda environment file used to setup macOS build. +======= + default: "3.12" + description: | + The python version to be used. 
Will be 3.9 by default +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: required: false type: string @@ -71,11 +77,19 @@ jobs: test-matrix: ${{ steps.filter.outputs.test-matrix }} steps: - name: Clean up disk space before running MacOS workflow +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.7 # [see note: pytorch repo ref] - name: Checkout PyTorch uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.8 + + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Set xcode version env: @@ -85,6 +99,7 @@ jobs: echo "DEVELOPER_DIR=/Applications/Xcode_${XCODE_VERSION}.app/Contents/Developer" >> "${GITHUB_ENV}" fi +<<<<<<< HEAD - name: Setup miniconda if: inputs.environment-file == '' uses: pytorch/test-infra/.github/actions/setup-miniconda@release/2.7 @@ -105,6 +120,16 @@ jobs: - name: Install sccache (only for non-forked PRs, and pushes to trunk) uses: nick-fields/retry@v3.0.0 +======= + - name: Setup Python + uses: pytorch/test-infra/.github/actions/setup-python@release/2.8 + with: + python-version: ${{ inputs.python-version }} + pip-requirements-file: .github/requirements/pip-requirements-macOS.txt + + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} with: timeout_minutes: 5 @@ -186,7 +211,11 @@ jobs: zip -1 -r artifacts.zip dist/ build/.ninja_log build/compile_commands.json .additional_ci_files - name: Store PyTorch Build Artifacts on GHA +<<<<<<< HEAD uses: actions/upload-artifact@v4 +======= + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' with: name: ${{ env.BUILD_ENVIRONMENT }} @@ -195,7 +224,11 @@ jobs: path: artifacts.zip - name: Upload sccache stats to GHA +<<<<<<< HEAD uses: actions/upload-artifact@v4 +======= + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Only if sccache is installed, see above if: ${{ (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository) && steps.build.outcome != 'skipped' }} with: @@ -207,4 +240,8 @@ jobs: - name: Clean up disk space if: always() continue-on-error: true +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml index 013461825f9a..8a2632b7d1ee 100644 --- 
a/.github/workflows/_mac-test.yml +++ b/.github/workflows/_mac-test.yml @@ -21,7 +21,11 @@ on: python-version: required: false type: string +<<<<<<< HEAD default: "3.9" +======= + default: "3.12" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) description: | The python version to be used. Will be 3.9 by default timeout-minutes: @@ -38,13 +42,36 @@ on: required: false type: boolean default: true +<<<<<<< HEAD +======= + monitor-log-interval: + description: | + Set the interval for the monitor script to log utilization. + required: false + type: number + default: 5 + monitor-data-collect-interval: + description: | + Set the interval for the monitor script to collect data. + required: false + type: number + default: 1 + secrets: + HUGGING_FACE_HUB_TOKEN: + required: false + description: | + HF Auth token to avoid rate limits when downloading models or datasets from hub +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: test: # Don't run on forked repos or empty test matrix if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]' +<<<<<<< HEAD # For setup-miniconda, see https://github.com/conda-incubator/setup-miniconda/issues/179 # Also ensure that we always run with the right architecture +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) defaults: run: shell: bash -e -l {0} @@ -73,6 +100,13 @@ jobs: pkill "${PROCESS}" || true done +<<<<<<< HEAD +======= + - name: Clean up leftover miniconda installation + continue-on-error: true + run: brew uninstall miniconda || true + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Clean up leftover local python3 site-packages on MacOS pet runner continue-on-error: true run: | @@ -82,18 +116,46 @@ jobs: done - name: Clean up disk space before running MacOS workflow +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.7 # [see note: pytorch repo ref] - name: Checkout PyTorch uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.8 + + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 + + - name: Get workflow job id + id: get-job-id + uses: ./.github/actions/get-workflow-job-id + if: always() + with: + github-token: ${{ secrets.GITHUB_TOKEN }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Start monitoring script id: monitor-script if: ${{ !inputs.disable-monitor }} continue-on-error: true +<<<<<<< HEAD run: | ${CONDA_RUN} python3 -m tools.stats.monitor > usage_log.txt 2>&1 & +======= + env: + JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} + WORKFLOW_NAME: ${{ github.workflow }} + WORKFLOW_RUN_ID: ${{github.run_id}} + MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }} + MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} + run: | + python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 + python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > 
usage_log.txt 2>&1 & +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" - name: Download build artifacts @@ -108,17 +170,27 @@ jobs: with: use-gha: true +<<<<<<< HEAD - name: Setup miniconda uses: pytorch/test-infra/.github/actions/setup-miniconda@release/2.7 with: python-version: ${{ inputs.python-version }} environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }} pip-requirements-file: .github/requirements/pip-requirements-${{ runner.os }}.txt +======= + - name: Setup Python + uses: pytorch/test-infra/.github/actions/setup-python@release/2.8 + with: + python-version: ${{ inputs.python-version }} + pip-requirements-file: .github/requirements/pip-requirements-macOS.txt + default-packages: "" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py +<<<<<<< HEAD - name: Get workflow job id id: get-job-id uses: ./.github/actions/get-workflow-job-id @@ -128,6 +200,10 @@ jobs: - name: Check for keep-going label and re-enabled test issues # This uses the filter-test-configs action because it conviniently +======= + - name: Check for keep-going label and re-enabled test issues + # This uses the filter-test-configs action because it conveniently +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # checks for labels and re-enabled test issues. It does not actually do # any filtering. All filtering is done in the build step. id: keep-going @@ -166,10 +242,15 @@ jobs: JOB_ID: ${{ steps.get-job-id.outputs.job-id }} JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }} +<<<<<<< HEAD +======= + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) run: | # shellcheck disable=SC1090 set -ex +<<<<<<< HEAD arch if [[ -n "$CONDA_ENV" ]]; then @@ -186,21 +267,46 @@ jobs: ${CONDA_RUN} python --version ${CONDA_RUN} python3 -mpip install --no-index --no-deps dist/*.whl +======= + # TODO: Remove me later, and properly activate venv + PATH="$(dirname "$(which python)"):$PATH" + export PATH + + # Print out some information about the test environment + for tool in python3 python; do + which $tool + $tool --version + done + + python3 -mpip install --no-index --no-deps dist/*.whl +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set +e pushd "${RUNNER_TEMP}" # Install pip dependencies if they are not found. This is to mitigate a peculiar # flaky missing dependencies on MacOS +<<<<<<< HEAD ${CONDA_RUN} python3 -c "import torch" +======= + python3 -c "import torch" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) RC=$? 
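The RC recorded above drives the conditional reinstall that follows; as a rough Python sketch of the same probe-then-reinstall pattern (the requirements path is a stand-in for whatever PIP_REQUIREMENTS_FILE resolves to in this step, so treat the snippet as illustrative rather than part of the workflow):

    import subprocess
    import sys

    # Probe the freshly installed wheel; a non-zero return code means "import torch" failed.
    rc = subprocess.call([sys.executable, "-c", "import torch"])
    if rc != 0:
        # Mirror the shell fallback: reinstall the pinned pip requirements so the
        # subsequent test script can import the wheel.
        requirements_file = ".github/requirements/pip-requirements-macOS.txt"  # assumed value of PIP_REQUIREMENTS_FILE
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "--ignore-installed", "-r", requirements_file]
        )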
popd if [ "${RC}" -ne 0 ]; then +<<<<<<< HEAD ${CONDA_RUN} python3 -mpip install --ignore-installed -r "${PIP_REQUIREMENTS_FILE}" fi set -e ${CONDA_RUN} .ci/pytorch/macos-test.sh +======= + python3 -mpip install --ignore-installed -r "${PIP_REQUIREMENTS_FILE}" + fi + set -e + + .ci/pytorch/macos-test.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Print remaining test logs shell: bash @@ -208,6 +314,16 @@ jobs: run: | cat test/**/*_toprint.log || true +<<<<<<< HEAD +======= + - name: Run OP benchmark + shell: bash + if: ${{ contains(steps.get-job-id.outputs.job-name, 'mps') }} + run: | + python3 test/bench_mps_ops.py + + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Stop monitoring script if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }} continue-on-error: true @@ -224,14 +340,37 @@ jobs: file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} - name: Upload the benchmark results +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: benchmark-results-dir: test/test-reports dry-run: false schema-version: v3 github-token: ${{ secrets.GITHUB_TOKEN }} +<<<<<<< HEAD - name: Clean up disk space if: always() continue-on-error: true uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.7 +======= + - name: Upload utilization stats + if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' && !inputs.disable-monitor }} + continue-on-error: true + uses: ./.github/actions/upload-utilization-stats + with: + job_id: ${{ steps.get-job-id.outputs.job-id }} + job_name: ${{ steps.get-job-id.outputs.job-name }} + workflow_name: ${{ github.workflow }} + workflow_run_id: ${{github.run_id}} + workflow_attempt: ${{github.run_attempt}} + local_path: usage_log.txt + + - name: Clean up disk space + if: always() + continue-on-error: true + uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml index babcc4c9bac9..1c01ef54fe8f 100644 --- a/.github/workflows/_rocm-test.yml +++ b/.github/workflows/_rocm-test.yml @@ -50,7 +50,22 @@ on: required: false type: boolean default: true +<<<<<<< HEAD +======= + monitor-log-interval: + description: | + Set the interval for the monitor script to log utilization. + required: false + type: number + default: 5 + monitor-data-collect-interval: + description: | + Set the interval for the monitor script to collect data. 
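The required/type/default fields for this input continue below; both intervals are ultimately passed to tools.stats.monitor as --log-interval and --data-collect-interval in the monitoring step further down. A minimal sketch of how a sampler could consume them, assuming the flags simply control how often data is collected (default 1s) and how often an aggregated line is logged (default 5s), with psutil standing in for the real collection logic:

    import argparse
    import time

    import psutil  # pinned as psutil==5.9.1 in the monitoring step

    parser = argparse.ArgumentParser()
    parser.add_argument("--log-interval", type=float, default=5)
    parser.add_argument("--data-collect-interval", type=float, default=1)
    args = parser.parse_args()

    samples, last_log = [], time.monotonic()
    while True:
        # Collect one utilization sample per data-collect interval.
        samples.append(psutil.cpu_percent(interval=None))
        if time.monotonic() - last_log >= args.log_interval:
            # Emit one aggregated line per log interval.
            print(f"avg cpu: {sum(samples) / len(samples):.1f}%", flush=True)
            samples, last_log = [], time.monotonic()
        time.sleep(args.data_collect_interval)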
+ required: false + type: number + default: 1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} @@ -70,7 +85,11 @@ jobs: steps: # [see note: pytorch repo ref] - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: true @@ -79,7 +98,11 @@ jobs: - name: configure aws credentials id: aws_creds +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v4 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only aws-region: us-east-1 @@ -88,27 +111,63 @@ jobs: - name: Login to Amazon ECR id: login-ecr continue-on-error: true +<<<<<<< HEAD uses: aws-actions/amazon-ecr-login@v2 - name: Calculate docker image id: calculate-docker-image uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7 +======= + uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 + + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image-name: ${{ inputs.docker-image }} - name: Pull docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Start monitoring script id: monitor-script +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + + - name: Get workflow job id + id: get-job-id + uses: ./.github/actions/get-workflow-job-id + if: always() + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Start monitoring script + id: monitor-script + env: + JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} + WORKFLOW_NAME: ${{ github.workflow }} + WORKFLOW_RUN_ID: ${{github.run_id}} + MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }} + MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ !inputs.disable-monitor }} shell: bash continue-on-error: true run: | +<<<<<<< HEAD python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84 python3 -m tools.stats.monitor > usage_log.txt 2>&1 & +======= + python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 + python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" - name: Download build artifacts @@ -124,6 +183,7 @@ jobs: id: 
parse-ref run: .github/scripts/parse_ref.py +<<<<<<< HEAD - name: Get workflow job id id: get-job-id uses: ./.github/actions/get-workflow-job-id @@ -133,6 +193,10 @@ jobs: - name: Check for keep-going label and re-enabled test issues # This uses the filter-test-configs action because it conviniently +======= + - name: Check for keep-going label and re-enabled test issues + # This uses the filter-test-configs action because it conveniently +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # checks for labels and re-enabled test issues. It does not actually do # any filtering. All filtering is done in the build step. id: keep-going @@ -285,7 +349,11 @@ jobs: find . -iname "core.[1-9]*" -exec docker exec "${CONTAINER_NAME}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \; - name: Store Core dumps on GitHub +<<<<<<< HEAD uses: actions/upload-artifact@v4 +======= + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: failure() with: name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }} @@ -294,7 +362,11 @@ jobs: path: ./**/core.[1-9]* - name: Authenticate with AWS +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v4 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results # The max duration enforced by the server side @@ -302,12 +374,30 @@ jobs: aws-region: us-east-1 - name: Upload the benchmark results +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: benchmark-results-dir: test/test-reports dry-run: false schema-version: v3 github-token: ${{ secrets.GITHUB_TOKEN }} +<<<<<<< HEAD +======= + - name: Upload utilization stats + if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' && !inputs.disable-monitor }} + continue-on-error: true + uses: ./.github/actions/upload-utilization-stats + with: + job_id: ${{ steps.get-job-id.outputs.job-id }} + job_name: ${{ steps.get-job-id.outputs.job-name }} + workflow_name: ${{ github.workflow }} + workflow_run_id: ${{github.run_id}} + workflow_attempt: ${{github.run_attempt}} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Teardown ROCm uses: ./.github/actions/teardown-rocm diff --git a/.github/workflows/_runner-determinator.yml b/.github/workflows/_runner-determinator.yml index b608a71c055a..ded600fa9230 100644 --- a/.github/workflows/_runner-determinator.yml +++ b/.github/workflows/_runner-determinator.yml @@ -7,7 +7,15 @@ on: required: false type: string description: | +<<<<<<< HEAD List of experiments for this workfow. If not defined, all default experiments are included. +======= + List of experiments for this workflow. If not defined, all default experiments are included. 
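The opt_out_experiments input introduced just below feeds the script's --opt-out-experiments flag (a comma-separated set), and per its help text it takes priority over --eligible-experiments. A compact sketch of that precedence, using the same frozenset semantics as the script further down (the sample experiment name is made up for illustration):

    def experiment_enabled(name: str, eligible: frozenset[str], opted_out: frozenset[str]) -> bool:
        # Opt-out wins: an experiment listed in both sets is still skipped.
        if opted_out and name in opted_out:
            return False
        # An empty eligible set means all default experiments remain eligible.
        if eligible and name not in eligible:
            return False
        return True

    # Example (hypothetical experiment name):
    # experiment_enabled("some_experiment", frozenset({"some_experiment"}), frozenset({"some_experiment"})) -> False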
+ opt_out_experiments: + required: false + type: string + description: Comma-separated list of experiments this workflow will opt-out of. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) triggering_actor: required: true type: string @@ -51,10 +59,18 @@ jobs: TRIGGERING_ACTOR: ${{ inputs.triggering_actor }} ISSUE_OWNER: ${{ inputs.issue_owner }} CHECK_EXPERIMENTS: ${{ inputs.check_experiments }} +<<<<<<< HEAD PR_NUMBER: ${{ github.event.pull_request.number }} steps: # - name: Checkout PyTorch # uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + OPT_OUT_EXPERIMENTS: ${{ inputs.opt_out_experiments }} + PR_NUMBER: ${{ github.event.pull_request.number }} + steps: + # - name: Checkout PyTorch + # uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # with: # fetch-depth: 1 # submodules: true @@ -267,6 +283,19 @@ jobs: help="comma separated list of experiments to check, if omitted all experiments marked with default=True are checked", ) parser.add_argument( +<<<<<<< HEAD +======= + "--opt-out-experiments", + type=_str_comma_separated_to_set, + required=False, + default="", + help=( + "comma separated list of experiments to opt-out of. If unset, no opt-outs will occur. " + "If the same experiment is listed both here and in '--eligible-experiments' opt-out will take priority." + ), + ) + parser.add_argument( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "--pr-number", type=str, required=False, @@ -490,6 +519,10 @@ jobs: workflow_requestors: Iterable[str], branch: str, eligible_experiments: frozenset[str] = frozenset(), +<<<<<<< HEAD +======= + opt_out_experiments: frozenset[str] = frozenset(), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) is_canary: bool = False, ) -> str: settings = parse_settings(rollout_state) @@ -504,6 +537,17 @@ jobs: ) continue +<<<<<<< HEAD +======= + if opt_out_experiments: + if experiment_name in opt_out_experiments: + opt_out_exp_list = ", ".join(opt_out_experiments) + log.info( + f"Skipping experiment '{experiment_name}', as this workflow has opted-out (opted out experiments are: {opt_out_exp_list})" + ) + continue + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if eligible_experiments: if experiment_name not in eligible_experiments: exp_list = ", ".join(eligible_experiments) @@ -668,6 +712,10 @@ jobs: (args.github_issue_owner, username), args.github_branch, args.eligible_experiments, +<<<<<<< HEAD +======= + args.opt_out_experiments, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) is_canary, ) @@ -705,4 +753,8 @@ jobs: --github-ref-type "$curr_ref_type" \ --github-repo "$GITHUB_REPOSITORY" \ --eligible-experiments "$CHECK_EXPERIMENTS" \ +<<<<<<< HEAD +======= + --opt-out-experiments "$OPT_OUT_EXPERIMENTS" \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --pr-number "${PR_NUMBER}" diff --git a/.github/workflows/_win-build.yml b/.github/workflows/_win-build.yml index 27f75767b685..6a59a1f86286 100644 --- 
a/.github/workflows/_win-build.yml +++ b/.github/workflows/_win-build.yml @@ -23,7 +23,11 @@ on: vc-year: required: false type: string +<<<<<<< HEAD default: "2019" +======= + default: "2022" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) description: The Visual Studio year to use for building. build-with-debug: required: false @@ -84,10 +88,17 @@ jobs: git config --global core.fsmonitor false - name: Clean up leftover processes on non-ephemeral Windows runner +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/cleanup-runner@release/2.7 - name: Setup SSH (Click me for login details) uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/cleanup-runner@release/2.8 + + - name: Setup SSH (Click me for login details) + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: github-secret: ${{ secrets.GITHUB_TOKEN }} instructions: | @@ -98,11 +109,19 @@ jobs: To start build locally, change working folder to \actions-runner\_work\pytorch\pytorch, Activate miniconda and Visual Studio environment, by running: call C:\Jenkins\Miniconda3\Scripts\activate.bat C:\Jenkins\Miniconda3 +<<<<<<< HEAD call "C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Auxiliary\Build\vcvarsall.bat" x64 # [see note: pytorch repo ref] - name: Checkout PyTorch uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + call "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\VC\Auxiliary\Build\vcvarsall.bat" x64 + + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: true @@ -173,7 +192,11 @@ jobs: # Upload to github so that people can click and download artifacts - name: Upload artifacts to s3 if: steps.build.outcome != 'skipped' +<<<<<<< HEAD uses: seemethere/upload-artifact-s3@v5 +======= + uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: retention-days: 14 if-no-files-found: error diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml index 544e6389c46c..60c989ce58ac 100644 --- a/.github/workflows/_win-test.yml +++ b/.github/workflows/_win-test.yml @@ -36,7 +36,22 @@ on: required: false type: boolean default: true +<<<<<<< HEAD +======= + monitor-log-interval: + description: | + Set the interval for the monitor script to log utilization. + required: false + type: number + default: 5 + monitor-data-collect-interval: + description: | + Set the interval for the monitor script to collect data. 
+ required: false + type: number + default: 1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} @@ -66,10 +81,17 @@ jobs: git config --global core.fsmonitor false - name: Clean up leftover processes on non-ephemeral Windows runner +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/cleanup-runner@release/2.7 - name: Setup SSH (Click me for login details) uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/cleanup-runner@release/2.8 + + - name: Setup SSH (Click me for login details) + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: github-secret: ${{ secrets.GITHUB_TOKEN }} instructions: | @@ -80,12 +102,20 @@ jobs: To start tests locally, change working folder to \actions-runner\_work\pytorch\pytorch\test, Activate miniconda and Visual Studio environment and set PYTHON_PATH, by running: call C:\Jenkins\Miniconda3\Scripts\activate.bat C:\Jenkins\Miniconda3 +<<<<<<< HEAD call "C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Auxiliary\Build\vcvarsall.bat" x64 +======= + call "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\VC\Auxiliary\Build\vcvarsall.bat" x64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set PYTHONPATH=C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build # [see note: pytorch repo ref] - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: true @@ -96,7 +126,11 @@ jobs: # TODO: Move to a requirements.txt file for windows - name: Install pip dependencies +<<<<<<< HEAD uses: nick-fields/retry@v3.0.0 +======= + uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: shell: bash timeout_minutes: 5 @@ -106,18 +140,46 @@ jobs: set -eu python3 -m pip install 'xdoctest>=1.1.0' +<<<<<<< HEAD - name: Start monitoring script id: monitor-script +======= + - name: Get workflow job id + id: get-job-id + uses: ./.github/actions/get-workflow-job-id + if: always() + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Start monitoring script + id: monitor-script + env: + JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} + WORKFLOW_NAME: ${{ github.workflow }} + WORKFLOW_RUN_ID: ${{github.run_id}} + MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }} + MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) shell: bash if: ${{ !inputs.disable-monitor }} continue-on-error: true run: | # Windows conda doesn't have python3 binary, only python, but it's python3 +<<<<<<< HEAD ${CONDA_RUN} python -m tools.stats.monitor > usage_log.txt 2>&1 & echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" - name: 
Download PyTorch Build Artifacts uses: seemethere/download-artifact-s3@v4 +======= + ${CONDA_RUN} python -m pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84 + ${CONDA_RUN} python -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & + echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" + + - name: Download PyTorch Build Artifacts + uses: seemethere/download-artifact-s3@1da556a7aa0a088e3153970611f6c432d58e80e6 # v4.2.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: name: ${{ inputs.build-environment }} path: C:\${{ github.run_id }}\build-results @@ -131,6 +193,7 @@ jobs: continue-on-error: true uses: ./.github/actions/download-td-artifacts +<<<<<<< HEAD - name: Get workflow job id id: get-job-id uses: ./.github/actions/get-workflow-job-id @@ -140,6 +203,10 @@ jobs: - name: Check for keep-going label and re-enabled test issues # This uses the filter-test-configs action because it conviniently +======= + - name: Check for keep-going label and re-enabled test issues + # This uses the filter-test-configs action because it conveniently +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # checks for labels and re-enabled test issues. It does not actually do # any filtering. All filtering is done in the build step. id: keep-going @@ -172,8 +239,13 @@ jobs: NO_TD: ${{ steps.keep-going.outputs.ci-no-td }} VC_PRODUCT: "BuildTools" VC_VERSION: "" +<<<<<<< HEAD VS_VERSION: "16.8.6" VC_YEAR: "2019" +======= + VS_VERSION: "17.4.1" + VC_YEAR: "2022" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AWS_DEFAULT_REGION: us-east-1 PR_NUMBER: ${{ github.event.pull_request.number }} GITHUB_REPOSITORY: ${{ github.repository }} @@ -236,6 +308,20 @@ jobs: with: file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} +<<<<<<< HEAD +======= + - name: Upload utilization stats + if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' && !inputs.disable-monitor }} + continue-on-error: true + uses: ./.github/actions/upload-utilization-stats + with: + job_id: ${{ steps.get-job-id.outputs.job-id }} + job_name: ${{ steps.get-job-id.outputs.job-name }} + workflow_name: ${{ github.workflow }} + workflow_run_id: ${{github.run_id}} + workflow_attempt: ${{github.run_attempt}} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Parse ref id: parse-ref shell: bash diff --git a/.github/workflows/_xpu-test.yml b/.github/workflows/_xpu-test.yml index baee45d2e9b1..2d1185e51433 100644 --- a/.github/workflows/_xpu-test.yml +++ b/.github/workflows/_xpu-test.yml @@ -46,6 +46,24 @@ on: required: false type: boolean default: true +<<<<<<< HEAD +======= + monitor-log-interval: + description: | + Set the interval for the monitor script to log utilization. + required: false + type: number + default: 5 + monitor-data-collect-interval: + description: | + Set the interval for the monitor script to collect data. 
+ required: false + type: number + default: 1 +permissions: + id-token: write + contents: read +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} @@ -62,20 +80,29 @@ jobs: steps: # [see note: pytorch repo ref] - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Setup XPU uses: ./.github/actions/setup-xpu - name: configure aws credentials id: aws_creds +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v1.7.0 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only aws-region: us-east-1 - name: Login to Amazon ECR id: login-ecr +<<<<<<< HEAD uses: aws-actions/amazon-ecr-login@v2 - name: Calculate docker image @@ -89,14 +116,58 @@ jobs: with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} +======= + uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 + + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-image-name: ${{ inputs.docker-image }} + + - name: Use following to pull public copy of the image + id: print-ghcr-mirror + env: + ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + shell: bash + run: | + tag=${ECR_DOCKER_IMAGE##*:} + echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + + - name: Get workflow job id + id: get-job-id + uses: ./.github/actions/get-workflow-job-id + if: always() + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Start monitoring script id: monitor-script if: ${{ !inputs.disable-monitor }} shell: bash continue-on-error: true +<<<<<<< HEAD run: | python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84 python3 -m tools.stats.monitor > usage_log.txt 2>&1 & +======= + env: + JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} + WORKFLOW_NAME: ${{ github.workflow }} + WORKFLOW_RUN_ID: ${{github.run_id}} + MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }} + MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} + run: | + python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84 + python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" - name: Download build artifacts @@ -108,6 +179,7 @@ jobs: id: parse-ref run: 
.github/scripts/parse_ref.py +<<<<<<< HEAD - name: Get workflow job id id: get-job-id uses: ./.github/actions/get-workflow-job-id @@ -117,6 +189,10 @@ jobs: - name: Check for keep-going label and re-enabled test issues # This uses the filter-test-configs action because it conviniently +======= + - name: Check for keep-going label and re-enabled test issues + # This uses the filter-test-configs action because it conveniently +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # checks for labels and re-enabled test issues. It does not actually do # any filtering. All filtering is done in the build step. id: keep-going @@ -244,6 +320,14 @@ jobs: # copy test results back to the mounted workspace, needed sudo, resulting permissions were correct docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test" +<<<<<<< HEAD +======= + - name: Change permissions + if: ${{ always() && steps.test.conclusion }} + run: | + docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test" + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Print remaining test logs shell: bash if: always() && steps.test.conclusion @@ -266,6 +350,20 @@ jobs: use-gha: true file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} +<<<<<<< HEAD +======= + - name: Upload utilization stats + if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' && !inputs.disable-monitor }} + continue-on-error: true + uses: ./.github/actions/upload-utilization-stats + with: + job_id: ${{ steps.get-job-id.outputs.job-id }} + job_name: ${{ steps.get-job-id.outputs.job-name }} + workflow_name: ${{ github.workflow }} + workflow_run_id: ${{github.run_id}} + workflow_attempt: ${{github.run_attempt}} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Collect backtraces from coredumps (if any) if: always() run: | @@ -279,7 +377,11 @@ jobs: docker stop "${{ env.CONTAINER_NAME }}" - name: Store Core dumps on GitHub +<<<<<<< HEAD uses: actions/upload-artifact@v4 +======= + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: failure() with: name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }} diff --git a/.github/workflows/assigntome-docathon.yml b/.github/workflows/assigntome-docathon.yml index 31fa28289b04..5bac7d3872c5 100644 --- a/.github/workflows/assigntome-docathon.yml +++ b/.github/workflows/assigntome-docathon.yml @@ -12,7 +12,11 @@ jobs: issues: write steps: - name: Check for "/assigntome" in comment +<<<<<<< HEAD uses: actions/github-script@v6 +======= + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: @@ -28,14 +32,22 @@ jobs: repo: context.repo.repo, issue_number: issueNumber }); +<<<<<<< HEAD const hasLabel = issue.labels.some(label => label.name === 'docathon-h1-2024'); +======= + const hasLabel = 
issue.labels.some(label => label.name === 'docathon-h1-2025'); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (hasLabel) { if (issue.assignee !== null) { await github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, issue_number: issueNumber, +<<<<<<< HEAD body: "The issue is already assigned. Please pick an opened and unnasigned issue with the [docathon-h1-2024 label](https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-h1-2024)." +======= + body: "The issue is already assigned. Please pick an opened and unnasigned issue with the [docathon-h1-2025 label](https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-h1-2025)." +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }); } else { await github.rest.issues.addAssignees({ @@ -46,7 +58,11 @@ jobs: }); } } else { +<<<<<<< HEAD const commmentMessage = "This issue does not have the correct label. Please pick an opened and unnasigned issue with the [docathon-h1-2024 label](https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-h1-2024)." +======= + const commmentMessage = "This issue does not have the correct label. Please pick an opened and unnasigned issue with the [docathon-h1-2025 label](https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-h1-2025)." +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) await github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, diff --git a/.github/workflows/auto_request_review.yml b/.github/workflows/auto_request_review.yml index 9aaf21193512..75138e8d3e78 100644 --- a/.github/workflows/auto_request_review.yml +++ b/.github/workflows/auto_request_review.yml @@ -15,7 +15,11 @@ jobs: steps: - name: Request review based on files changes and/or groups the author belongs to # v0.7.0 +<<<<<<< HEAD uses: necojackarc/auto-request-review@e08cdffa277d50854744de3f76230260e61c67f4 +======= + uses: necojackarc/auto-request-review@e08cdffa277d50854744de3f76230260e61c67f4 # v0.7.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-almalinux-images.yml b/.github/workflows/build-almalinux-images.yml index 68aa873037f0..ccecd7e64ab9 100644 --- a/.github/workflows/build-almalinux-images.yml +++ b/.github/workflows/build-almalinux-images.yml @@ -11,6 +11,7 @@ on: # Release candidate tags look like: v1.11.0-rc1 - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ paths: +<<<<<<< HEAD - '.ci/docker/almalinux/*' - '.ci/docker/common/*' - .github/workflows/build-almalinux-images.yml @@ -19,11 +20,25 @@ on: - '.ci/docker/almalinux/*' - '.ci/docker/common/*' - .github/workflows/build-almalinux-images.yml +======= + - .ci/docker/** + - .github/workflows/build-almalinux-images.yml + - .github/actions/binary-docker-build/** + pull_request: + paths: + - .ci/docker/** + - .github/workflows/build-almalinux-images.yml + - .github/actions/binary-docker-build/** +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: DOCKER_REGISTRY: "docker.io" DOCKER_BUILDKIT: 1 +<<<<<<< HEAD WITH_PUSH: ${{ github.event_name == 'push' && 
(github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release')) }} +======= + WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release') || startsWith(github.ref, 'refs/tags/v')) }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} @@ -32,6 +47,7 @@ concurrency: jobs: build-docker: if: github.repository_owner == 'pytorch' +<<<<<<< HEAD environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }} runs-on: linux.9xlarge.ephemeral strategy: @@ -71,3 +87,19 @@ jobs: retry_wait_seconds: 90 command: | .ci/docker/almalinux/build.sh almalinux-builder${{ matrix.cuda_version == 'cpu' && ':' || ':cuda' }}${{matrix.cuda_version}} +======= + environment: ${{ (github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release') || startsWith(github.ref, 'refs/tags/v')) && 'docker-build') || '' }} + runs-on: linux.9xlarge.ephemeral + strategy: + matrix: + tag: ["cuda12.6", "cuda12.8", "cuda12.9", "rocm6.3", "rocm6.4", "cpu"] + steps: + - name: Build docker image + uses: pytorch/pytorch/.github/actions/binary-docker-build@release/2.8 + with: + docker-image-name: almalinux-builder + custom-tag-prefix: ${{matrix.tag}} + docker-build-dir: almalinux + DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }} + DOCKER_ID: ${{ secrets.DOCKER_ID }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/build-libtorch-images.yml b/.github/workflows/build-libtorch-images.yml index 3372888cf848..1d52e860f9ba 100644 --- a/.github/workflows/build-libtorch-images.yml +++ b/.github/workflows/build-libtorch-images.yml @@ -10,6 +10,7 @@ on: # Release candidate tags look like: v1.11.0-rc1 - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ paths: +<<<<<<< HEAD - '.ci/docker/libtorch/*' - '.ci/docker/common/*' - .github/workflows/build-libtorch-images.yml @@ -18,11 +19,25 @@ on: - '.ci/docker/libtorch/*' - '.ci/docker/common/*' - .github/workflows/build-libtorch-images.yml +======= + - .ci/docker/** + - .github/workflows/build-libtorch-images.yml + - .github/actions/binary-docker-build/** + pull_request: + paths: + - .ci/docker/** + - .github/workflows/build-libtorch-images.yml + - .github/actions/binary-docker-build/** +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: DOCKER_REGISTRY: "docker.io" DOCKER_BUILDKIT: 1 +<<<<<<< HEAD WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release')) }} +======= + WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release') || startsWith(github.ref, 'refs/tags/v')) }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} @@ -32,13 +47,18 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: 
pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD build-docker-cuda: environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }} needs: get-label-type @@ -159,3 +179,30 @@ jobs: retry_wait_seconds: 90 command: | .ci/docker/libtorch/build.sh libtorch-cxx11-builder:cpu +======= + build: + environment: ${{ (github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release') || startsWith(github.ref, 'refs/tags/v')) && 'docker-build') || '' }} + needs: get-label-type + runs-on: ${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral + name: libtorch-cxx11-builder:${{ matrix.tag }} + strategy: + fail-fast: false + matrix: + include: [ + { tag: "cuda12.9" }, + { tag: "cuda12.8" }, + { tag: "cuda12.6" }, + { tag: "rocm6.3" }, + { tag: "rocm6.4" }, + { tag: "cpu" }, + ] + steps: + - name: Build docker image + uses: pytorch/pytorch/.github/actions/binary-docker-build@release/2.8 + with: + docker-image-name: libtorch-cxx11-builder + custom-tag-prefix: ${{ matrix.tag }} + docker-build-dir: libtorch + DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }} + DOCKER_ID: ${{ secrets.DOCKER_ID }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/build-magma-linux.yml b/.github/workflows/build-magma-linux.yml index aeaf6e6717a8..cd57d925bc02 100644 --- a/.github/workflows/build-magma-linux.yml +++ b/.github/workflows/build-magma-linux.yml @@ -34,23 +34,38 @@ jobs: id-token: write strategy: matrix: +<<<<<<< HEAD cuda_version: ["128", "126", "124", "118"] steps: - name: Checkout PyTorch uses: actions/checkout@v4 +======= + cuda_version: ["129", "128", "126"] + steps: + - name: Checkout PyTorch + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Build Magma Cuda working-directory: .ci/magma run: | # Produces artifacts under magma/output/linux-64/magma-cuda*.bz2 make magma-cuda${{ matrix.cuda_version }} - name: Save as artifact +<<<<<<< HEAD uses: actions/upload-artifact@v4 +======= + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: path: .ci/magma/output/linux-64/magma-cuda*.bz2 name: artifact_${{ matrix.cuda_version }} - name: Configure AWS credentials(PyTorch account) if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v3 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: 
arn:aws:iam::308535385114:role/gha_workflow_s3_ossci_linux_windows_read_write
          aws-region: us-east-1
diff --git a/.github/workflows/build-magma-rocm-linux.yml b/.github/workflows/build-magma-rocm-linux.yml
new file mode 100644
index 000000000000..b6eb09188fd4
--- /dev/null
+++ b/.github/workflows/build-magma-rocm-linux.yml
@@ -0,0 +1,69 @@
+name: build-linux-magma-rocm
+
+on:
+  push:
+    branches:
+      main
+    paths:
+      - .ci/magma-rocm/*
+      - .ci/magma-rocm/package_files/*
+      - .github/workflows/build-magma-rocm-linux.yml
+  pull_request:
+    paths:
+      - .ci/magma-rocm/*
+      - .ci/magma-rocm/package_files/*
+      - .github/workflows/build-magma-rocm-linux.yml
+
+defaults:
+  run:
+    shell: bash -x -e -l {0}
+env:
+  BUILD_ENVIRONMENT: build-linux-magma-rocm
+  IN_CI: 1
+  IS_GHA: 1
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  build-linux-magma-rocm:
+    if: github.repository_owner == 'pytorch'
+    runs-on: linux.2xlarge
+    permissions:
+      id-token: write
+    strategy:
+      matrix:
+        rocm_version: ["64", "63"]
+    steps:
+      - name: Checkout PyTorch
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - name: Build Magma Rocm
+        working-directory: .ci/magma-rocm
+        run: |
+          # Produces artifacts under magma-rocm/output/linux-64/magma-rocm*.bz2
+          make magma-rocm${{ matrix.rocm_version }}
+      - name: Save as artifact
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          path: .ci/magma-rocm/output/linux-64/magma-rocm*.bz2
+          name: artifact_${{ matrix.rocm_version }}
+      - name: Configure AWS credentials(PyTorch account)
+        if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
+        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_ossci_linux_windows_read_write
+          aws-region: us-east-1
+      - name: Set DRY_RUN
+        if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' }}
+        run: |
+          echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
+      - name: Upload binaries
+        shell: bash
+        env:
+          PKG_DIR: ".ci/magma-rocm/output/linux-64/"
+          TARGET_OS: "linux"
+          PKG_INCLUDE: "magma-rocm*.tar.bz2"
+        run: |
+          set -ex
+          bash .github/scripts/upload_aws_ossci.sh
diff --git a/.github/workflows/build-magma-windows.yml b/.github/workflows/build-magma-windows.yml
index 9a1970a5feb7..168215334e3e 100644
--- a/.github/workflows/build-magma-windows.yml
+++ b/.github/workflows/build-magma-windows.yml
@@ -19,17 +19,31 @@ concurrency:
 jobs:
   build-windows-magma:
     if: github.repository_owner == 'pytorch'
+<<<<<<< HEAD
     runs-on: windows-2019
     strategy:
       matrix:
        cuda_version: ["128", "126", "124", "118"]
+=======
+    runs-on: windows-2022
+    strategy:
+      matrix:
+        cuda_version: ["129", "128", "126"]
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
        config: ["Release", "Debug"]
     env:
       CUDA_VERSION: ${{ matrix.cuda_version }}
       CONFIG: ${{ matrix.config }}
+<<<<<<< HEAD
     steps:
       - name: Checkout pytorch/pytorch
        uses: actions/checkout@v4
+=======
+    VC_YEAR: "2022"
+    steps:
+      - name: Checkout pytorch/pytorch
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
      - name: Enable MSVC dev commands to enable cl.exe # FYI incompatible with shell: bash
        uses: ilammy/msvc-dev-cmd@dd5e2fa0a7de1e7929605d9ecc020e749d9856a3
      - name: Install CUDA Toolkit
@@ -37,7 +51,11 @@ jobs:
      - name: Build MAGMA and push to S3
        run: .github/scripts/windows/build_magma.bat
      - name: Save as artifact
+<<<<<<< HEAD
        uses: actions/upload-artifact@v4
+=======
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
        with:
          path: magma_*_cuda*_*.7z
          name: artifact_${{ matrix.cuda_version }}_${{ matrix.config }}
@@ -49,12 +67,21 @@ jobs:
     needs: build-windows-magma
     steps:
       - name: Checkout PyTorch
+<<<<<<< HEAD
        uses: actions/checkout@v4
      - name: Download all artifacts
        uses: actions/download-artifact@v4
      - name: Configure AWS credentials(PyTorch account)
        if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
        uses: aws-actions/configure-aws-credentials@v3
+=======
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - name: Download all artifacts
+        uses: actions/download-artifact@95815c38cf2ff2164869cbab79da8d1f422bc89e # v4.2.1
+      - name: Configure AWS credentials(PyTorch account)
+        if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
+        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_ossci_linux_windows_read_write
          aws-region: us-east-1
diff --git a/.github/workflows/build-manywheel-images-s390x.yml b/.github/workflows/build-manywheel-images-s390x.yml
index decedf8a334b..43d4a023a4c2 100644
--- a/.github/workflows/build-manywheel-images-s390x.yml
+++ b/.github/workflows/build-manywheel-images-s390x.yml
@@ -3,6 +3,7 @@ name: Build manywheel docker images for s390x
 on:
   workflow_dispatch:
   push:
+<<<<<<< HEAD
     branches:
       - main
       - release/*
@@ -20,13 +21,22 @@ on:
       - '.ci/docker/manywheel/*'
       - '.ci/docker/manywheel/build_scripts/*'
       - '.ci/docker/common/*'
+=======
+    tags:
+      - ciflow/s390/*
+    paths:
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
       - .github/workflows/build-manywheel-images-s390x.yml
 
 env:
   DOCKER_REGISTRY: "docker.io"
   DOCKER_BUILDKIT: 1
+<<<<<<< HEAD
   WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release')) }}
+=======
+  WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release') || startsWith(github.ref, 'refs/tags/v')) }}
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@@ -35,6 +45,7 @@ concurrency:
 jobs:
   build-docker-cpu-s390x:
     if: github.repository_owner == 'pytorch'
+<<<<<<< HEAD
     environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
     runs-on: linux.s390x
     env:
@@ -57,12 +68,66 @@ jobs:
       - name: Build Docker Image
         run: |
           .ci/docker/manywheel/build.sh manylinuxs390x-builder:cpu-s390x
+=======
+    environment: ${{ (github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release') || startsWith(github.ref, 'refs/tags/v')) && 'docker-build') || '' }}
+    runs-on: linux.s390x
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8
+        with:
+          submodules: false
+          no-sudo: true
+
+      - name: Build Docker Image
+        run: |
+          .ci/docker/manywheel/build.sh manylinuxs390x-builder:cpu-s390x -t manylinuxs390x-builder:cpu-s390x
+
+      - name: Tag and (if WITH_PUSH) push docker image to docker.io
+        env:
+          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
+          DOCKER_ID: ${{ secrets.DOCKER_ID }}
+          CREATED_FULL_DOCKER_IMAGE_NAME: manylinuxs390x-builder:cpu-s390x
+        shell: bash
+        run: |
+          set -euox pipefail
+          GITHUB_REF="${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}"
+          GIT_BRANCH_NAME="${GITHUB_REF##*/}"
+          GIT_COMMIT_SHA="${GITHUB_SHA:-$(git rev-parse HEAD)}"
+          CI_FOLDER_SHA="$(git rev-parse HEAD:.ci/docker)"
+
+          DOCKER_IMAGE_NAME_PREFIX="docker.io/pytorch/${CREATED_FULL_DOCKER_IMAGE_NAME}"
+
+          docker tag "${CREATED_FULL_DOCKER_IMAGE_NAME}" "${DOCKER_IMAGE_NAME_PREFIX}-${GIT_BRANCH_NAME}"
+          docker tag "${CREATED_FULL_DOCKER_IMAGE_NAME}" "${DOCKER_IMAGE_NAME_PREFIX}-${GIT_COMMIT_SHA}"
+          docker tag "${CREATED_FULL_DOCKER_IMAGE_NAME}" "${DOCKER_IMAGE_NAME_PREFIX}-${CI_FOLDER_SHA}"
+
+          # Pretty sure Github will mask tokens and I'm not sure if it will even be
+          # printed due to pipe, but just in case
+          set +x
+          if [[ "${WITH_PUSH:-false}" == "true" ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+            docker push "${DOCKER_IMAGE_NAME_PREFIX}-${GIT_BRANCH_NAME}"
+            docker push "${DOCKER_IMAGE_NAME_PREFIX}-${GIT_COMMIT_SHA}"
+            docker push "${DOCKER_IMAGE_NAME_PREFIX}-${CI_FOLDER_SHA}"
+          fi
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
       - name: Cleanup docker
         if: cancelled()
         shell: bash
         run: |
+<<<<<<< HEAD
           # if podman build command is interrupted,
           # it can leave a couple of processes still running.
           # order them to stop for clean shutdown.
+=======
+          # If podman build command is interrupted,
+          # it can leave a couple of processes still running.
+          # Order them to stop for clean shutdown.
+          # It looks like sometimes some processes remain
+          # after first cleanup.
+          # Wait a bit and do cleanup again. It looks like it helps.
+ docker system prune --build -f || true + sleep 60 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) docker system prune --build -f || true diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index 1eaf692414e3..d54e1a7f69d2 100644 --- a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -11,6 +11,7 @@ on: # Release candidate tags look like: v1.11.0-rc1 - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ paths: +<<<<<<< HEAD - '.ci/docker/common/*' - '.ci/docker/manywheel/*' - '.ci/docker/manywheel/build_scripts/*' @@ -22,12 +23,26 @@ on: - '.ci/docker/manywheel/build_scripts/*' - .github/workflows/build-manywheel-images.yml +======= + - .ci/docker/** + - .github/workflows/build-manywheel-images.yml + - .github/actions/binary-docker-build/** + pull_request: + paths: + - .ci/docker/** + - .github/workflows/build-manywheel-images.yml + - .github/actions/binary-docker-build/** +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: DOCKER_REGISTRY: "docker.io" DOCKER_BUILDKIT: 1 +<<<<<<< HEAD WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release')) }} +======= + WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release') || startsWith(github.ref, 'refs/tags/v')) }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true @@ -36,13 +51,18 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD build-docker-cuda-manylinux_2_28: environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }} needs: get-label-type @@ -362,3 +382,35 @@ jobs: retry_wait_seconds: 90 command: | .ci/docker/manywheel/build.sh manylinux2_28-builder:xpu +======= + build: + environment: ${{ (github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release') || startsWith(github.ref, 'refs/tags/v')) && 'docker-build') || '' }} + needs: get-label-type + strategy: + fail-fast: false + matrix: + include: [ + { name: "manylinux2_28-builder", tag: "cuda12.9", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "cuda12.8", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "cuda12.6", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinuxaarch64-builder", tag: "cuda12.9", runner: "linux.arm64.2xlarge.ephemeral" }, + { name: "manylinuxaarch64-builder", tag: "cuda12.8", runner: 
"linux.arm64.2xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "rocm6.3", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "rocm6.4", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28_aarch64-builder", tag: "cpu-aarch64", runner: "linux.arm64.2xlarge.ephemeral" }, + { name: "manylinuxcxx11-abi-builder", tag: "cpu-cxx11-abi", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "xpu", runner: "linux.9xlarge.ephemeral" }, + ] + runs-on: ${{ needs.get-label-type.outputs.label-type }}${{ matrix.runner }} + name: ${{ matrix.name }}:${{ matrix.tag }} + steps: + - name: Build docker image + uses: pytorch/pytorch/.github/actions/binary-docker-build@release/2.8 + with: + docker-image-name: ${{ matrix.name }} + custom-tag-prefix: ${{ matrix.tag }} + docker-build-dir: manywheel + DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }} + DOCKER_ID: ${{ secrets.DOCKER_ID }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/build-triton-wheel.yml b/.github/workflows/build-triton-wheel.yml index 988d18fe736c..0650e34ea652 100644 --- a/.github/workflows/build-triton-wheel.yml +++ b/.github/workflows/build-triton-wheel.yml @@ -3,7 +3,11 @@ name: Build Triton wheels on: push: branches: +<<<<<<< HEAD - release/2.7 +======= + - release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) tags: # NOTE: Binary build pipelines should only get triggered on release candidate builds # Release candidate tags look like: v1.11.0-rc1 @@ -16,6 +20,10 @@ on: - .github/scripts/windows/build_triton.bat - .ci/docker/ci_commit_pins/triton.txt - .ci/docker/ci_commit_pins/triton-xpu.txt +<<<<<<< HEAD +======= + workflow_dispatch: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pull_request: paths: - .github/workflows/build-triton-wheel.yml @@ -34,7 +42,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -53,7 +65,11 @@ jobs: docker-image: ["pytorch/manylinux2_28-builder:cpu"] include: - device: "rocm" +<<<<<<< HEAD rocm_version: "6.3" +======= + rocm_version: "6.4" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" - device: "cuda" rocm_version: "" @@ -72,12 +88,20 @@ jobs: PLATFORM: 'manylinux_2_28_x86_64' steps: - name: Setup SSH (Click me for login details) +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: github-secret: ${{ secrets.GITHUB_TOKEN }} - name: Checkout 
PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false @@ -85,7 +109,11 @@ jobs: uses: ./.github/actions/setup-linux - name: Pull Docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ env.DOCKER_IMAGE }} @@ -138,6 +166,18 @@ jobs: docker exec -t "${container_name}" yum install -y zlib-devel zip docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip install -U setuptools==78.1.0 pybind11==2.13.1 auditwheel wheel +<<<<<<< HEAD +======= + set +e + docker exec -t "${container_name}" command -v pip + has_pip=$? + set -e + if [ $has_pip -eq 0 ] ; then + docker exec -t "${container_name}" pip install -U cmake --force-reinstall + else + docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip install -U cmake --force-reinstall + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ ("${{ matrix.device }}" == "cuda" || "${{ matrix.device }}" == "rocm" || "${{ matrix.device }}" == "aarch64" ) ]]; then # With this install, it gets clang 16.0.6. @@ -160,14 +200,22 @@ jobs: fi docker exec -t "${container_name}" chown -R 1000.1000 /artifacts/wheelhouse +<<<<<<< HEAD - uses: actions/upload-artifact@v4.4.0 +======= + - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: name: pytorch-triton-wheel-${{ matrix.py_vers }}-${{ matrix.device }}-${{ env.PLATFORM }} if-no-files-found: error path: ${{ runner.temp }}/artifacts/wheelhouse/* - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() build-wheel-win: @@ -200,7 +248,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -214,7 +266,11 @@ jobs: # in https://github.com/actions/checkout/issues/1018 git config --global core.fsmonitor false - name: Checkout PyTorch +<<<<<<< HEAD uses: actions/checkout@v4 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: false @@ -247,7 +303,11 @@ 
jobs: .github/scripts/windows/build_triton.bat mkdir -p "${RUNNER_TEMP}/artifacts/" mv ./*.whl "${RUNNER_TEMP}/artifacts/" +<<<<<<< HEAD - uses: actions/upload-artifact@v4.4.0 +======= + - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: name: pytorch-triton-wheel-${{ matrix.py_vers }}-${{ matrix.device }} if-no-files-found: error @@ -266,24 +326,40 @@ jobs: image: continuumio/miniconda3:4.12.0 environment: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') && 'nightly-wheel-upload' || '' }} steps: +<<<<<<< HEAD - uses: actions/checkout@v3 - name: Configure AWS credentials(PyTorch account) for main if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' }} uses: aws-actions/configure-aws-credentials@v3 +======= + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Configure AWS credentials(PyTorch account) for main + if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' }} + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_nightly_build_wheels aws-region: us-east-1 - name: Configure AWS credentials(PyTorch account) for RC builds if: ${{ github.event_name == 'push' && (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/')) }} +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v3 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_test_build_wheels aws-region: us-east-1 - name: Download Build Artifacts +<<<<<<< HEAD uses: actions/download-artifact@v4.1.7 +======= + uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: # Download all available artifacts path: ${{ runner.temp }}/artifacts-all diff --git a/.github/workflows/check-labels.yml b/.github/workflows/check-labels.yml index dc7ee1930920..aa7e64961d85 100644 --- a/.github/workflows/check-labels.yml +++ b/.github/workflows/check-labels.yml @@ -38,13 +38,21 @@ jobs: runs-on: linux.24_04.4x steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false fetch-depth: 1 - name: Setup Python +<<<<<<< HEAD uses: actions/setup-python@v4 +======= + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: '3.9' architecture: x64 diff --git a/.github/workflows/check_mergeability_ghstack.yml b/.github/workflows/check_mergeability_ghstack.yml index 
ddf5311cbf01..804ae479fe6c 100644 --- a/.github/workflows/check_mergeability_ghstack.yml +++ b/.github/workflows/check_mergeability_ghstack.yml @@ -10,7 +10,11 @@ jobs: if: github.repository_owner == 'pytorch' runs-on: ubuntu-latest steps: +<<<<<<< HEAD - uses: actions/checkout@v4 +======= + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 0 @@ -50,7 +54,11 @@ jobs: fi - name: Setup Python +<<<<<<< HEAD uses: actions/setup-python@v4 +======= + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: '3.9' cache: pip diff --git a/.github/workflows/cherry-pick.yml b/.github/workflows/cherry-pick.yml index d8eeeb6b4ec8..b8ce2452af99 100644 --- a/.github/workflows/cherry-pick.yml +++ b/.github/workflows/cherry-pick.yml @@ -14,13 +14,21 @@ jobs: steps: - name: Checkout repo id: checkout +<<<<<<< HEAD uses: actions/checkout@v3 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 0 token: ${{ secrets.GH_PYTORCHBOT_CHERRY_PICK_TOKEN }} - name: Setup Python +<<<<<<< HEAD uses: actions/setup-python@v4 +======= + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: '3.11' cache: pip diff --git a/.github/workflows/close-nonexistent-disable-issues.yml b/.github/workflows/close-nonexistent-disable-issues.yml index b17789f9abe9..047e29cff981 100644 --- a/.github/workflows/close-nonexistent-disable-issues.yml +++ b/.github/workflows/close-nonexistent-disable-issues.yml @@ -13,7 +13,11 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false fetch-depth: 1 diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml index c6bf6803c766..89b5ebd89d75 100644 --- a/.github/workflows/create_release.yml +++ b/.github/workflows/create_release.yml @@ -19,7 +19,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -36,7 +40,11 @@ jobs: outputs: pt_release_name: ${{ steps.release_name.outputs.pt_release_name }} steps: +<<<<<<< HEAD - uses: actions/checkout@v4 +======= + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel 
for mixed dtypes with float/bfloat16/half (#2791)) with: show-progress: false submodules: 'recursive' @@ -55,6 +63,11 @@ jobs: tag_or_branch="${tag_or_branch//\//_}" echo "PT_RELEASE_NAME=pytorch-$tag_or_branch" >> "$GITHUB_ENV" echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz" >> "$GITHUB_ENV" +<<<<<<< HEAD +======= + - name: Checkout optional submodules + run: python3 tools/optional_submodules.py +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Create source distribution run: | # Create new folder with specified name so extracting the archive yields that @@ -69,18 +82,30 @@ jobs: echo "Created source archive $PT_RELEASE_FILE with content: $(ls -a "$PT_RELEASE_NAME")" - name: Upload source distribution for release if: ${{ github.event_name == 'release' }} +<<<<<<< HEAD uses: softprops/action-gh-release@v1 +======= + uses: softprops/action-gh-release@da05d552573ad5aba039eaac05058a918a7bf631 # v2.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: files: ${{env.PT_RELEASE_FILE}} - name: Upload source distribution to GHA artifacts for release tags if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }} +<<<<<<< HEAD uses: actions/upload-artifact@v4.4.0 +======= + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: name: ${{ env.PT_RELEASE_FILE }} path: ${{ env.PT_RELEASE_FILE }} - name: Set output id: release_name +<<<<<<< HEAD run: echo "::set-output name=pt_release_name::${{ env.PT_RELEASE_NAME }}.tar.gz" +======= + run: echo "pt_release_name=${{ env.PT_RELEASE_NAME }}.tar.gz" >> "${GITHUB_OUTPUT}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) upload_source_code_to_s3: if: ${{ github.repository == 'pytorch/pytorch' && github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }} @@ -93,6 +118,7 @@ jobs: - get-label-type - release steps: +<<<<<<< HEAD - uses: actions/download-artifact@v4.1.7 with: name: ${{ needs.release.outputs.pt_release_name }} @@ -102,6 +128,17 @@ jobs: role-to-assume: arn:aws:iam::749337293305:role/gha_pytorch_source_code_upload_role aws-region: us-east-1 - uses: seemethere/upload-artifact-s3@v5 +======= + - uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 + with: + name: ${{ needs.release.outputs.pt_release_name }} + - name: Configure AWS credentials(PyTorch account) + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::749337293305:role/gha_pytorch_source_code_upload_role + aws-region: us-east-1 + - uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: s3-bucket: pytorch s3-prefix: source_code/test diff --git a/.github/workflows/delete_old_branches.yml b/.github/workflows/delete_old_branches.yml index eabb98e32065..803418360826 100644 --- a/.github/workflows/delete_old_branches.yml +++ b/.github/workflows/delete_old_branches.yml @@ -22,12 +22,20 @@ jobs: steps: - name: Checkout repo +<<<<<<< 
HEAD uses: actions/checkout@v3 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 0 - name: Setup Python +<<<<<<< HEAD uses: actions/setup-python@v4 +======= + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: '3.11' architecture: x64 diff --git a/.github/workflows/docathon-sync-label.yml b/.github/workflows/docathon-sync-label.yml index 08703be573a6..65cb16e67d90 100644 --- a/.github/workflows/docathon-sync-label.yml +++ b/.github/workflows/docathon-sync-label.yml @@ -14,11 +14,19 @@ jobs: pull-requests: write steps: - name: Check out the repo +<<<<<<< HEAD uses: actions/checkout@v2 with: fetch-depth: 1 - name: Set up Python uses: actions/setup-python@v2 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 1 + - name: Set up Python + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: 3.x - name: Install dependencies diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 903c81fd539e..b798cdf36703 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -33,7 +33,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -49,6 +53,7 @@ jobs: matrix: runner: [linux.12xlarge] docker-image-name: [ +<<<<<<< HEAD pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks, pytorch-linux-focal-cuda12.4-cudnn9-py3.12-gcc9-inductor-benchmarks, pytorch-linux-focal-cuda12.4-cudnn9-py3.13-gcc9-inductor-benchmarks, @@ -74,20 +79,54 @@ jobs: pytorch-linux-focal-py3-clang10-onnx, pytorch-linux-focal-linter, pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter, +======= + pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11, + pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks, + pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks, + pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks, + pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks, + pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks, + pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks, + pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9, + pytorch-linux-jammy-py3.9-clang12, + pytorch-linux-jammy-py3.11-clang12, + pytorch-linux-jammy-py3.12-clang12, + pytorch-linux-jammy-py3.13-clang12, + pytorch-linux-jammy-rocm-n-1-py3, + pytorch-linux-jammy-rocm-n-py3, + pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12, + pytorch-linux-jammy-py3.9-gcc11, + pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks, + pytorch-linux-jammy-py3.12-halide, + 
pytorch-linux-jammy-xpu-2025.0-py3, + pytorch-linux-jammy-xpu-2025.1-py3, + pytorch-linux-jammy-py3-clang15-asan, + pytorch-linux-jammy-py3-clang18-asan, + pytorch-linux-jammy-py3-clang12-onnx, + pytorch-linux-jammy-linter, + pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pytorch-linux-jammy-py3-clang12-executorch, pytorch-linux-jammy-py3.12-triton-cpu ] include: - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11 +<<<<<<< HEAD runner: linux.arm64.2xlarge +======= + runner: linux.arm64.m7g.4xlarge +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks runner: linux.arm64.m7g.4xlarge timeout-minutes: 600 # Docker uploads fail from LF runners, see https://github.com/pytorch/pytorch/pull/137358 # runs-on: "${{ needs.get-label-type.outputs.label-type }}${{ matrix.runner }}" runs-on: "${{ matrix.runner }}" +<<<<<<< HEAD env: DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/${{ matrix.docker-image-name }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Clean workspace shell: bash @@ -99,32 +138,54 @@ jobs: # [see note: pytorch repo ref] # deep clone (fetch-depth 0) required for git merge-base - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Setup Linux uses: ./.github/actions/setup-linux - name: Build docker image id: build-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7 with: docker-image-name: ${{ matrix.docker-image-name }} +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-image-name: ci-image:${{ matrix.docker-image-name }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) always-rebuild: true push: true - name: Pull docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: ${{ steps.build-docker-image.outputs.docker-image }} - uses: nick-fields/retry@v3.0.0 name: Push to https://https://ghcr.io/ +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.build-docker-image.outputs.docker-image }} + + - uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0 + name: Push to https://ghcr.io/ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) id: push-to-ghcr-io if: ${{ github.event_name == 'push' }} env: ECR_DOCKER_IMAGE: ${{ steps.build-docker-image.outputs.docker-image }} GHCR_PAT: ${{ secrets.GHCR_PAT }} +<<<<<<< HEAD IMAGE_NAME: ${{ matrix.docker-image-name }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: shell: bash timeout_minutes: 30 @@ -135,8 +196,13 @@ jobs: tag=${ECR_DOCKER_IMAGE##*:} # Push docker image to the ghcr.io echo 
$GHCR_PAT | docker login ghcr.io -u pytorch --password-stdin +<<<<<<< HEAD docker tag "${ECR_DOCKER_IMAGE}" "${ghcr_image}:${IMAGE_NAME}-${tag}" docker push "${ghcr_image}:${IMAGE_NAME}-${tag}" +======= + docker tag "${ECR_DOCKER_IMAGE}" "${ghcr_image}:${tag}" + docker push "${ghcr_image}:${tag}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Chown workspace uses: ./.github/actions/chown-workspace @@ -145,5 +211,9 @@ jobs: if: always() - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() diff --git a/.github/workflows/docker-cache-mi300.yml b/.github/workflows/docker-cache-mi300.yml new file mode 100644 index 000000000000..065030dbf68b --- /dev/null +++ b/.github/workflows/docker-cache-mi300.yml @@ -0,0 +1,55 @@ +name: docker-cache-mi300 + +on: + # run every 6 hours + schedule: + - cron: 0 0,6,12,18 * * * + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + docker-cache: + if: github.repository_owner == 'pytorch' + runs-on: rocm-docker + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 + with: + no-sudo: true + + - name: configure aws credentials + id: aws_creds + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + + - name: Login to Amazon ECR + id: login-ecr + continue-on-error: false + uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 + + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 + push: false + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + + - name: Tar and upload to S3 bucket + run: | + sudo docker save -o ~/docker-data/pytorch/pytorch_docker_image.tar ${{ steps.calculate-docker-image.outputs.docker-image }} + sudo rclone copy -P --s3-upload-concurrency 64 --s3-chunk-size 200M --s3-upload-cutoff 300M ~/docker-data/pytorch/pytorch_docker_image.tar oci:pytorchbucket0002/pytorch_docker_image --progress diff --git a/.github/workflows/docker-release.yml b/.github/workflows/docker-release.yml index fa8116f03109..7af30009ce08 100644 --- a/.github/workflows/docker-release.yml +++ b/.github/workflows/docker-release.yml @@ -37,7 +37,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ 
github.event.pull_request.user.login || github.event.issue.user.login }} @@ -52,7 +56,11 @@ jobs: matrix: ${{ steps.generate-matrix.outputs.matrix }} steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 1 submodules: true @@ -82,13 +90,21 @@ jobs: CUDNN_VERSION: ${{ matrix.cudnn_version }} steps: - name: Setup SSH (Click me for login details) +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: github-secret: ${{ secrets.GITHUB_TOKEN }} # [see note: pytorch repo ref] # deep clone (fetch-depth 0) required for git merge-base - name: Checkout PyTorch +<<<<<<< HEAD uses: actions/checkout@v3 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 0 submodules: 'recursive' @@ -96,18 +112,30 @@ jobs: uses: ./.github/actions/setup-linux - name: Login to GitHub Container Registry if: ${{ env.WITH_PUSH == 'true' }} +<<<<<<< HEAD uses: docker/login-action@v2 +======= + uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: registry: ghcr.io username: pytorch password: ${{ secrets.GHCR_PAT }} # Setup multi-arch image builds - name: Set up QEMU +<<<<<<< HEAD uses: docker/setup-qemu-action@v3 env: QEMU_BINARY_PATH: ${{ runner.temp }}/bin - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 +======= + uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 # v3.6.0 + env: + QEMU_BINARY_PATH: ${{ runner.temp }}/bin + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: version: latest driver-opts: image=moby/buildkit:v0.19.0 @@ -156,7 +184,11 @@ jobs: docker push ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}${CUDA_SUFFIX}" +<<<<<<< HEAD # Please note, here we ned to pin specific verison of CUDA as with latest label +======= + # Please note, here we need to pin specific version of CUDA as with latest label +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ ${CUDA_VERSION_SHORT} == "${STABLE_CUDA_VERSION}" ]]; then docker tag ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}${CUDA_SUFFIX}" \ ghcr.io/pytorch/pytorch-nightly:latest @@ -164,12 +196,20 @@ jobs: fi - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() validate: needs: build +<<<<<<< HEAD uses: 
pytorch/test-infra/.github/workflows/validate-docker-images.yml@release/2.7 +======= + uses: pytorch/test-infra/.github/workflows/validate-docker-images.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: channel: test ref: main diff --git a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml index 108fff638950..8108bb233f05 100644 --- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml @@ -18,6 +18,12 @@ on: - 'ciflow/binaries_wheel/*' workflow_dispatch: +<<<<<<< HEAD +======= +permissions: + id-token: write + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: # Needed for conda builds ALPINE_IMAGE: "arm64v8/alpine" @@ -38,7 +44,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -55,8 +65,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -79,8 +94,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cpu-aarch64 @@ -103,8 +123,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cpu-aarch64 @@ -112,7 +137,11 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_9-cuda-aarch64-12_8-build: +======= + manywheel-py3_9-cuda-aarch64-12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: 
./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -121,16 +150,25 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8-aarch64 GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" +<<<<<<< HEAD build_name: manywheel-py3_9-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64' @@ -138,16 +176,30 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cuda-aarch64-12_8-upload: # Uploading +======= + build_name: manywheel-py3_9-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda-aarch64-12_9-upload: # Uploading +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read +<<<<<<< HEAD needs: manywheel-py3_9-cuda-aarch64-12_8-build +======= + needs: manywheel-py3_9-cuda-aarch64-12_9-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8-aarch64 GPU_ARCH_TYPE: cuda-aarch64 @@ -156,6 +208,16 @@ jobs: use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cuda-aarch64-12_8 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda-aarch64-12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -171,8 +233,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -195,8 +262,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cpu-aarch64 @@ -219,8 +291,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cpu-aarch64 @@ -228,7 +305,11 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD 
manywheel-py3_10-cuda-aarch64-12_8-build: +======= + manywheel-py3_10-cuda-aarch64-12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -237,16 +318,25 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8-aarch64 GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" +<<<<<<< HEAD build_name: manywheel-py3_10-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64' @@ -254,16 +344,30 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda-aarch64-12_8-upload: # Uploading +======= + build_name: manywheel-py3_10-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_10-cuda-aarch64-12_9-upload: # Uploading +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read +<<<<<<< HEAD needs: manywheel-py3_10-cuda-aarch64-12_8-build +======= + needs: manywheel-py3_10-cuda-aarch64-12_9-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8-aarch64 GPU_ARCH_TYPE: cuda-aarch64 @@ -272,6 +376,16 @@ jobs: use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda-aarch64-12_8 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cuda-aarch64-12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -287,8 +401,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -311,8 +430,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu-aarch64 @@ -335,8 +459,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel 
for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu-aarch64 @@ -344,7 +473,11 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_11-cuda-aarch64-12_8-build: +======= + manywheel-py3_11-cuda-aarch64-12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -353,16 +486,25 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8-aarch64 GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" +<<<<<<< HEAD build_name: manywheel-py3_11-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64' @@ -370,16 +512,30 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda-aarch64-12_8-upload: # Uploading +======= + build_name: manywheel-py3_11-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda-aarch64-12_9-upload: # Uploading +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read +<<<<<<< HEAD needs: manywheel-py3_11-cuda-aarch64-12_8-build +======= + needs: manywheel-py3_11-cuda-aarch64-12_9-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8-aarch64 GPU_ARCH_TYPE: cuda-aarch64 @@ -388,6 +544,16 @@ jobs: use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda-aarch64-12_8 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda-aarch64-12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -403,8 +569,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -427,8 +598,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu-aarch64 @@ -451,8 +627,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 
+<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu-aarch64 @@ -460,7 +641,11 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_12-cuda-aarch64-12_8-build: +======= + manywheel-py3_12-cuda-aarch64-12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -469,16 +654,25 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8-aarch64 GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" +<<<<<<< HEAD build_name: manywheel-py3_12-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64' @@ -486,16 +680,30 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda-aarch64-12_8-upload: # Uploading +======= + build_name: manywheel-py3_12-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-cuda-aarch64-12_9-upload: # Uploading +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read +<<<<<<< HEAD needs: manywheel-py3_12-cuda-aarch64-12_8-build +======= + needs: manywheel-py3_12-cuda-aarch64-12_9-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8-aarch64 GPU_ARCH_TYPE: cuda-aarch64 @@ -504,6 +712,16 @@ jobs: use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda-aarch64-12_8 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda-aarch64-12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -519,8 +737,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -543,8 +766,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu-aarch64 @@ -567,8 +795,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu-aarch64 @@ -576,7 +809,11 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13-cuda-aarch64-12_8-build: +======= + manywheel-py3_13-cuda-aarch64-12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -585,16 +822,25 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8-aarch64 GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" +<<<<<<< HEAD build_name: manywheel-py3_13-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64' @@ -602,16 +848,30 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} 
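The PYTORCH_EXTRA_INSTALL_REQUIREMENTS values above are pipe-separated PEP 508 requirement strings, each pinning an exact NVIDIA wheel version and guarded by an environment marker so the pin only applies on Linux x86_64 hosts. The sketch below is illustrative only: it assumes the " | " separator is how these strings are meant to be split (that is how they appear in the workflow text; the consuming build script is not shown here), uses the third-party packaging library, and the extra_reqs value is a shortened excerpt of the full list.

# Hedged sketch: inspect a PYTORCH_EXTRA_INSTALL_REQUIREMENTS-style string.
# Assumption: " | " separates individual PEP 508 requirements, as in the
# workflow text above; the real consumer of this variable is not shown here.
from packaging.requirements import Requirement

extra_reqs = (
    "nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
    "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'"
)

for raw in extra_reqs.split(" | "):
    req = Requirement(raw.strip())
    # The marker restricts the pinned dependency to Linux x86_64 machines.
    applies = req.marker.evaluate() if req.marker is not None else True
    print(f"{req.name}{req.specifier}: {'install' if applies else 'skip'} on this host")

On an aarch64 machine every marker above evaluates to False, so these x86_64-only pins would simply be skipped at install time.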
manywheel-py3_13-cuda-aarch64-12_8-upload: # Uploading +======= + build_name: manywheel-py3_13-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13-cuda-aarch64-12_9-upload: # Uploading +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read +<<<<<<< HEAD needs: manywheel-py3_13-cuda-aarch64-12_8-build +======= + needs: manywheel-py3_13-cuda-aarch64-12_9-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8-aarch64 GPU_ARCH_TYPE: cuda-aarch64 @@ -620,6 +880,16 @@ jobs: use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda-aarch64-12_8 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-cuda-aarch64-12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -635,8 +905,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -659,8 +934,13 @@ jobs: # favor of 
GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cpu-aarch64 @@ -683,8 +963,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cpu-aarch64 @@ -692,7 +977,11 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13t-cuda-aarch64-12_8-build: +======= + manywheel-py3_13t-cuda-aarch64-12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -701,16 +990,25 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8-aarch64 GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" +<<<<<<< HEAD build_name: manywheel-py3_13t-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 
'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64' @@ -718,16 +1016,30 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda-aarch64-12_8-upload: # Uploading +======= + build_name: manywheel-py3_13t-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cuda-aarch64-12_9-upload: # Uploading +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read +<<<<<<< HEAD needs: manywheel-py3_13t-cuda-aarch64-12_8-build +======= + needs: manywheel-py3_13t-cuda-aarch64-12_9-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8-aarch64 GPU_ARCH_TYPE: cuda-aarch64 @@ -736,6 +1048,16 @@ jobs: use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda-aarch64-12_8 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cuda-aarch64-12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-linux-binary-libtorch-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-nightly.yml new file mode 100644 index 000000000000..f7aa0ba20236 --- /dev/null +++ b/.github/workflows/generated-linux-binary-libtorch-nightly.yml @@ -0,0 +1,543 @@ +# 
@generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: linux-binary-libtorch + + +on: + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_libtorch/*' + workflow_dispatch: + +permissions: + id-token: write + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + AWS_DEFAULT_REGION: us-east-1 + BINARY_ENV_FILE: /tmp/env + BUILD_ENVIRONMENT: linux-binary-libtorch + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_FINAL_PACKAGE_DIR: /artifacts + PYTORCH_ROOT: /pytorch + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 0 +concurrency: + group: linux-binary-libtorch-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + get-label-type: + if: github.repository_owner == 'pytorch' + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + libtorch-cpu-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cpu + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: libtorch-cpu-shared-with-deps-release + build_environment: linux-binary-libtorch + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cpu-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-cpu-shared-with-deps-release-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cpu + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-cpu-shared-with-deps-release + build_environment: linux-binary-libtorch + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.4xlarge + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cpu-shared-with-deps-release-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-cpu-shared-with-deps-release-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: 
libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cpu + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-cpu-shared-with-deps-release + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + libtorch-cuda12_6-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: libtorch-cuda12_6-shared-with-deps-release + build_environment: linux-binary-libtorch + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cuda12_6-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-cuda12_6-shared-with-deps-release-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-cuda12_6-shared-with-deps-release + build_environment: linux-binary-libtorch + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cuda12_6-shared-with-deps-release-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-cuda12_6-shared-with-deps-release-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-cuda12_6-shared-with-deps-release + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + libtorch-cuda12_8-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: 
libtorch-cuda12_8-shared-with-deps-release + build_environment: linux-binary-libtorch + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cuda12_8-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-cuda12_8-shared-with-deps-release-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-cuda12_8-shared-with-deps-release + build_environment: linux-binary-libtorch + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cuda12_8-shared-with-deps-release-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-cuda12_8-shared-with-deps-release-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-cuda12_8-shared-with-deps-release + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + libtorch-cuda12_9-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: libtorch-cuda12_9-shared-with-deps-release + build_environment: linux-binary-libtorch + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cuda12_9-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-cuda12_9-shared-with-deps-release-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-cuda12_9-shared-with-deps-release + build_environment: linux-binary-libtorch + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + 
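Each binary configuration in these generated workflows follows the same three-job chain: a -build job that calls ./.github/workflows/_binary-build-linux.yml, a -test job that needs the build plus get-label-type, and an -upload job that needs the test and calls _binary-upload.yml, with the configuration carried through the with: inputs. The sketch below is a minimal way to visualise that chain; it assumes PyYAML is installed and only reads the jobs mapping of the generated file added in this diff.

# Minimal sketch (assumes PyYAML is available): print the needs/uses edges of
# a generated binary workflow so the build -> test -> upload chain is visible.
import yaml

path = ".github/workflows/generated-linux-binary-libtorch-nightly.yml"
with open(path) as f:
    jobs = yaml.safe_load(f).get("jobs", {})

for name, job in jobs.items():
    needs = job.get("needs", [])
    if isinstance(needs, str):  # "needs" may be a single job name or a list
        needs = [needs]
    reusable = job.get("uses", "(inline steps)")
    print(f"{name}  needs: {', '.join(needs) or '-'}  uses: {reusable}")

Run against the file above, this would show, for example, that libtorch-cuda12_9-shared-with-deps-release-upload depends on the matching -test job, which in turn depends on the -build job and get-label-type.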
libtorch-cuda12_9-shared-with-deps-release-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-cuda12_9-shared-with-deps-release-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-cuda12_9-shared-with-deps-release + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + libtorch-rocm6_3-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: libtorch-rocm6_3-shared-with-deps-release + build_environment: linux-binary-libtorch + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-rocm6_3-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-rocm6_3-shared-with-deps-release-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: libtorch-rocm6_3-shared-with-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: libtorch-cxx11-builder + custom-tag-prefix: rocm6.3 + docker-build-dir: .ci/docker + 
working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + libtorch-rocm6_3-shared-with-deps-release-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-rocm6_3-shared-with-deps-release-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-rocm6_3-shared-with-deps-release + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + libtorch-rocm6_4-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: libtorch-rocm6_4-shared-with-deps-release + build_environment: linux-binary-libtorch + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-rocm6_4-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-rocm6_4-shared-with-deps-release-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: libtorch-rocm6_4-shared-with-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + 
role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: libtorch-cxx11-builder + custom-tag-prefix: rocm6.4 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + libtorch-rocm6_4-shared-with-deps-release-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-rocm6_4-shared-with-deps-release-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-rocm6_4-shared-with-deps-release + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-linux-binary-libtorch-release-main.yml b/.github/workflows/generated-linux-binary-libtorch-release-main.yml new file mode 100644 index 000000000000..1b231ca5ffb6 --- /dev/null +++ b/.github/workflows/generated-linux-binary-libtorch-release-main.yml @@ -0,0 +1,87 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: linux-binary-libtorch-release + + +on: + push: + branches: + - main + tags: + - 'ciflow/trunk/*' + workflow_dispatch: + +permissions: + id-token: write + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + AWS_DEFAULT_REGION: us-east-1 + BINARY_ENV_FILE: /tmp/env + BUILD_ENVIRONMENT: linux-binary-libtorch-release + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_FINAL_PACKAGE_DIR: /artifacts + PYTORCH_ROOT: /pytorch + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 0 +concurrency: + group: linux-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + get-label-type: + if: github.repository_owner == 'pytorch' + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + libtorch-cpu-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: 
+ PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cpu + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: libtorch-cpu-shared-with-deps-release + build_environment: linux-binary-libtorch-release + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cpu-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-cpu-shared-with-deps-release-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cpu + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-cpu-shared-with-deps-release + build_environment: linux-binary-libtorch-release + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.4xlarge + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-linux-binary-manywheel-main.yml b/.github/workflows/generated-linux-binary-manywheel-main.yml index 524d7dca0c77..60725edae2b9 100644 --- a/.github/workflows/generated-linux-binary-manywheel-main.yml +++ b/.github/workflows/generated-linux-binary-manywheel-main.yml @@ -13,6 +13,12 @@ on: - 'ciflow/trunk/*' workflow_dispatch: +<<<<<<< HEAD +======= +permissions: + id-token: write + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: # Needed for conda builds ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" @@ -33,12 +39,17 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD manywheel-py3_9-cuda11_8-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -86,6 +97,8 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_9-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -98,14 +111,23 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_9-cuda12_6 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cuda12_6-test: # Testing @@ -122,14 +144,23 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False 
DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.4xlarge.nvidia.gpu +======= + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -145,14 +176,23 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_9-cuda12_8 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' 
and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cuda12_8-test: # Testing @@ -169,13 +209,162 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + + manywheel-py3_9-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-cuda12_9-build + - get-label-type + uses: 
./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_9 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + + manywheel-py3_9-rocm6_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-rocm6_4 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-rocm6_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-rocm6_4-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.9" + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: manywheel-py3_9-rocm6_4 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.4 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ 
steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml index 6d5e940571fc..41b8ae2d9de2 100644 --- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml @@ -18,6 +18,12 @@ on: - 'ciflow/binaries_wheel/*' workflow_dispatch: +<<<<<<< HEAD +======= +permissions: + id-token: write + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: # Needed for conda builds ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" @@ -38,7 +44,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -55,8 +65,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -77,8 +92,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cpu @@ -100,8 +120,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cpu @@ -109,6 +134,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_9-cpu-cxx11-abi-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -243,6 +269,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes 
with float/bfloat16/half (#2791)) manywheel-py3_9-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -255,14 +283,23 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_9-cuda12_6 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cuda12_6-test: # Testing @@ -279,14 +316,23 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.4xlarge.nvidia.gpu +======= + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cuda12_6-upload: # Uploading @@ -303,8 +349,13 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cuda12_6 @@ -324,14 +375,23 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_9-cuda12_8 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: 
nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cuda12_8-test: # Testing @@ -348,14 +408,23 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cuda12_8-upload: # Uploading @@ -372,8 +441,13 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cuda12_8 @@ -381,7 +455,11 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_9-rocm6_2_4-build: +======= + manywheel-py3_9-cuda12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -390,6 +468,7 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to 
get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.2.4 GPU_ARCH_VERSION: 6.2.4 GPU_ARCH_TYPE: rocm @@ -458,11 +537,33 @@ jobs: id-token: write contents: read needs: manywheel-py3_9-rocm6_2_4-test +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-cuda12_9-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.2.4 GPU_ARCH_VERSION: 6.2.4 GPU_ARCH_TYPE: rocm @@ -471,6 +572,40 @@ jobs: use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-rocm6_2_4 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_9 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-cuda12_9-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda 
+ DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -487,8 +622,13 @@ jobs: DESIRED_CUDA: rocm6.3 GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -501,7 +641,11 @@ jobs: needs: - manywheel-py3_9-rocm6_3-build - get-label-type +<<<<<<< HEAD runs-on: linux.rocm.gpu +======= + runs-on: linux.rocm.gpu.mi250 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 240 env: PYTORCH_ROOT: /pytorch @@ -512,8 +656,13 @@ jobs: GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" steps: @@ -538,12 +687,40 @@ jobs: - name: ROCm set GPU_FLAG run: | echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" +<<<<<<< HEAD - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: pytorch/manylinux2_28-builder:rocm6.3-2.7 - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary +======= + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.3 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Teardown ROCm uses: ./.github/actions/teardown-rocm manywheel-py3_9-rocm6_3-upload: # Uploading @@ -560,8 +737,13 @@ jobs: DESIRED_CUDA: rocm6.3 GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm +<<<<<<< HEAD 
DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-rocm6_3 @@ -569,6 +751,122 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD +======= + manywheel-py3_9-rocm6_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-rocm6_4 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-rocm6_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-rocm6_4-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.9" + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: manywheel-py3_9-rocm6_4 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.4 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + 
- name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + manywheel-py3_9-rocm6_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-rocm6_4-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-rocm6_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_9-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -580,14 +878,23 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_9-xpu build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-xpu-test: # Testing @@ -605,8 +912,13 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" permissions: @@ -617,7 +929,11 @@ jobs: uses: 
./.github/actions/setup-xpu - name: configure aws credentials id: aws_creds +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v1.7.0 +======= + uses: aws-actions/configure-aws-credentials@v4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only aws-region: us-east-1 @@ -640,12 +956,32 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +<<<<<<< HEAD - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: pytorch/manylinux2_28-builder:xpu-2.7 - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary +======= + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: xpu + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Teardown XPU uses: ./.github/actions/teardown-xpu manywheel-py3_9-xpu-upload: # Uploading @@ -661,8 +997,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-xpu @@ -681,8 +1022,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -703,8 +1049,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cpu @@ -726,8 +1077,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cpu @@ -735,6 +1091,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_10-cpu-cxx11-abi-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -869,6 +1226,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_10-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -881,14 +1240,23 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda12_6 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 
'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_6-test: # Testing @@ -905,14 +1273,23 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.4xlarge.nvidia.gpu +======= + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_6-upload: # Uploading @@ -929,8 +1306,13 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda12_6 @@ -950,14 +1332,23 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda12_8 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_8-test: # Testing @@ -974,14 +1365,23 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_8-upload: # Uploading @@ -998,8 +1398,13 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast 
kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda12_8 @@ -1007,7 +1412,11 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_10-rocm6_2_4-build: +======= + manywheel-py3_10-cuda12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -1016,6 +1425,7 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.2.4 GPU_ARCH_VERSION: 6.2.4 GPU_ARCH_TYPE: rocm @@ -1084,11 +1494,33 @@ jobs: id-token: write contents: read needs: manywheel-py3_10-rocm6_2_4-test +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.10" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_10-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_10-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_10-cuda12_9-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.2.4 GPU_ARCH_VERSION: 6.2.4 GPU_ARCH_TYPE: rocm @@ -1097,6 +1529,40 @@ jobs: use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-rocm6_2_4 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + 
DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cuda12_9 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_10-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_10-cuda12_9-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -1113,8 +1579,13 @@ jobs: DESIRED_CUDA: rocm6.3 GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -1127,7 +1598,11 @@ jobs: needs: - manywheel-py3_10-rocm6_3-build - get-label-type +<<<<<<< HEAD runs-on: linux.rocm.gpu +======= + runs-on: linux.rocm.gpu.mi250 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 240 env: PYTORCH_ROOT: /pytorch @@ -1138,8 +1613,13 @@ jobs: GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" steps: @@ -1164,12 +1644,40 @@ jobs: - name: ROCm set GPU_FLAG run: | echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" +<<<<<<< HEAD - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: pytorch/manylinux2_28-builder:rocm6.3-2.7 - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary +======= + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.3 + 
docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Teardown ROCm uses: ./.github/actions/teardown-rocm manywheel-py3_10-rocm6_3-upload: # Uploading @@ -1186,8 +1694,13 @@ jobs: DESIRED_CUDA: rocm6.3 GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-rocm6_3 @@ -1195,6 +1708,122 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD +======= + manywheel-py3_10-rocm6_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.10" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_10-rocm6_4 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_10-rocm6_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_10-rocm6_4-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.10" + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: manywheel-py3_10-rocm6_4 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: 
Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.4 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + manywheel-py3_10-rocm6_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_10-rocm6_4-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-rocm6_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_10-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1206,14 +1835,23 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-xpu build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | 
intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-xpu-test: # Testing @@ -1231,8 +1869,13 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" permissions: @@ -1243,7 +1886,11 @@ jobs: uses: ./.github/actions/setup-xpu - name: configure aws credentials id: aws_creds +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v1.7.0 +======= + uses: aws-actions/configure-aws-credentials@v4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only aws-region: us-east-1 @@ -1266,12 +1913,32 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +<<<<<<< HEAD - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: pytorch/manylinux2_28-builder:xpu-2.7 - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary +======= + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: xpu + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Teardown XPU uses: ./.github/actions/teardown-xpu manywheel-py3_10-xpu-upload: # Uploading @@ -1287,8 +1954,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-xpu @@ -1307,8 +1979,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" @@ -1329,8 +2006,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu @@ -1352,8 +2034,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu @@ -1361,6 +2048,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_11-cpu-cxx11-abi-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1495,6 +2183,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_11-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1507,14 +2197,23 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda12_6 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and 
platform_machine == 'x86_64' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_6-test: # Testing @@ -1531,14 +2230,23 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.4xlarge.nvidia.gpu +======= + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_6-upload: # Uploading @@ -1555,8 +2263,13 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_6 @@ -1564,6 +2277,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_11-cuda12_6-full-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1629,6 +2343,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) manywheel-py3_11-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1641,14 +2357,23 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda12_8 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_8-test: # Testing @@ -1665,14 +2390,23 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_8-upload: # Uploading @@ -1689,8 +2423,13 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_8 @@ -1698,7 +2437,11 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_11-rocm6_2_4-build: +======= + manywheel-py3_11-cuda12_8-full-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -1707,6 +2450,7 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.2.4 GPU_ARCH_VERSION: 6.2.4 GPU_ARCH_TYPE: rocm @@ -1775,19 +2519,143 @@ jobs: id-token: write contents: read needs: manywheel-py3_11-rocm6_2_4-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.2.4 - GPU_ARCH_VERSION: 6.2.4 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.2.4-2.7 - DESIRED_DEVTOOLSET: cxx11-abi - use_split_build: False +======= + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_11-cuda12_8-full + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda12_8-full-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_11-cuda12_8-full-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: 
manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION +<<<<<<< HEAD + DESIRED_CUDA: rocm6.2.4 + GPU_ARCH_VERSION: 6.2.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.2.4-2.7 + DESIRED_DEVTOOLSET: cxx11-abi + use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-rocm6_2_4 +======= + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda12_8-full + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda12_8-full-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-cuda12_8-full-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda12_8-full + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_11-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_11-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 
'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_11-cuda12_9-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda12_9 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-cuda12_9-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -1804,8 +2672,13 @@ jobs: DESIRED_CUDA: rocm6.3 GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -1818,7 +2691,11 @@ jobs: needs: - manywheel-py3_11-rocm6_3-build - get-label-type +<<<<<<< HEAD runs-on: linux.rocm.gpu +======= + runs-on: linux.rocm.gpu.mi250 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 240 env: PYTORCH_ROOT: /pytorch @@ -1829,8 +2706,13 @@ jobs: GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" steps: @@ -1855,12 +2737,40 @@ jobs: - name: ROCm set GPU_FLAG run: | echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" +<<<<<<< HEAD - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: pytorch/manylinux2_28-builder:rocm6.3-2.7 - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary +======= + - name: configure aws credentials + 
id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.3 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Teardown ROCm uses: ./.github/actions/teardown-rocm manywheel-py3_11-rocm6_3-upload: # Uploading @@ -1877,8 +2787,13 @@ jobs: DESIRED_CUDA: rocm6.3 GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-rocm6_3 @@ -1886,6 +2801,122 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD +======= + manywheel-py3_11-rocm6_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_11-rocm6_4 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-rocm6_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_11-rocm6_4-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.11" + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: manywheel-py3_11-rocm6_4 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: 
false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.4 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + manywheel-py3_11-rocm6_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-rocm6_4-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-rocm6_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_11-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1897,14 +2928,23 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-xpu build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 +======= + 
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-xpu-test: # Testing @@ -1922,8 +2962,13 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" permissions: @@ -1934,7 +2979,11 @@ jobs: uses: ./.github/actions/setup-xpu - name: configure aws credentials id: aws_creds +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v1.7.0 +======= + uses: aws-actions/configure-aws-credentials@v4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only aws-region: us-east-1 @@ -1957,12 +3006,32 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +<<<<<<< HEAD - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: pytorch/manylinux2_28-builder:xpu-2.7 - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary +======= + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: xpu + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Teardown XPU uses: ./.github/actions/teardown-xpu manywheel-py3_11-xpu-upload: # Uploading @@ -1978,8 +3047,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel 
for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-xpu @@ -1998,8 +3072,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -2020,8 +3099,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu @@ -2043,8 +3127,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu @@ -2052,6 +3141,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_12-cpu-cxx11-abi-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -2186,6 +3276,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_12-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -2198,14 +3290,23 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_6 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_6-test: # Testing @@ -2222,14 +3323,23 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.4xlarge.nvidia.gpu +======= + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_6-upload: # Uploading @@ -2246,8 +3356,13 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_6 @@ -2267,14 +3382,23 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_8 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: 
${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_8-test: # Testing @@ -2291,14 +3415,23 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_8-upload: # Uploading @@ -2315,8 +3448,13 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_8 @@ -2324,7 +3462,11 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_12-rocm6_2_4-build: +======= + manywheel-py3_12-cuda12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -2333,6 +3475,7 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.2.4 GPU_ARCH_VERSION: 6.2.4 GPU_ARCH_TYPE: rocm @@ -2401,11 +3544,33 @@ jobs: id-token: write contents: read needs: manywheel-py3_12-rocm6_2_4-test +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.12" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_12-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_12-cuda12_9-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.2.4 GPU_ARCH_VERSION: 6.2.4 GPU_ARCH_TYPE: rocm @@ -2414,6 +3579,40 @@ jobs: use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-rocm6_2_4 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda12_9 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_12-cuda12_9-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -2430,8 +3629,13 @@ jobs: DESIRED_CUDA: rocm6.3 GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -2444,7 +3648,11 @@ jobs: needs: - manywheel-py3_12-rocm6_3-build - get-label-type +<<<<<<< HEAD runs-on: linux.rocm.gpu +======= + runs-on: linux.rocm.gpu.mi250 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 240 env: PYTORCH_ROOT: /pytorch @@ -2455,8 +3663,13 @@ jobs: GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 
DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" steps: @@ -2481,37 +3694,186 @@ jobs: - name: ROCm set GPU_FLAG run: | echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" +<<<<<<< HEAD - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: pytorch/manylinux2_28-builder:rocm6.3-2.7 - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary +======= + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.3 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + manywheel-py3_12-rocm6_3-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_12-rocm6_3-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm +<<<<<<< HEAD + DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 + DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + use_split_build: False + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-rocm6_3 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + +<<<<<<< HEAD +======= + manywheel-py3_12-rocm6_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.12" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + 
build_name: manywheel-py3_12-rocm6_4 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-rocm6_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_12-rocm6_4-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.12" + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: manywheel-py3_12-rocm6_4 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.4 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_12-rocm6_3-upload: # Uploading + manywheel-py3_12-rocm6_4-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_12-rocm6_3-test + needs: manywheel-py3_12-rocm6_4-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 - DESIRED_DEVTOOLSET: cxx11-abi + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 use_split_build: False DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-rocm6_3 + build_name: manywheel-py3_12-rocm6_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) manywheel-py3_12-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -2523,14 +3885,23 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-xpu build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-xpu-test: # Testing @@ -2548,8 +3919,13 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" permissions: @@ -2560,7 +3936,11 @@ jobs: uses: ./.github/actions/setup-xpu - name: configure aws credentials id: aws_creds +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v1.7.0 +======= + uses: aws-actions/configure-aws-credentials@v4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only aws-region: us-east-1 @@ -2583,12 +3963,32 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +<<<<<<< HEAD - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: pytorch/manylinux2_28-builder:xpu-2.7 - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary +======= + - name: Calculate docker image + id: 
calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: xpu + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Teardown XPU uses: ./.github/actions/teardown-xpu manywheel-py3_12-xpu-upload: # Uploading @@ -2604,8 +4004,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-xpu @@ -2624,8 +4029,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -2646,8 +4056,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu @@ -2669,8 +4084,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu @@ -2678,6 +4098,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13-cpu-cxx11-abi-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -2812,6 +4233,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_13-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -2824,14 +4247,23 @@ jobs: 
DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda12_6 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_6-test: # Testing @@ -2848,14 +4280,23 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 
GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.4xlarge.nvidia.gpu +======= + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_6-upload: # Uploading @@ -2872,8 +4313,13 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda12_6 @@ -2893,14 +4339,23 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda12_8 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 
'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_8-test: # Testing @@ -2917,14 +4372,23 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_8-upload: # Uploading @@ -2941,8 +4405,13 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda12_8 @@ -2950,7 +4419,11 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13-rocm6_2_4-build: +======= + manywheel-py3_13-cuda12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -2959,6 +4432,7 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.2.4 GPU_ARCH_VERSION: 6.2.4 GPU_ARCH_TYPE: rocm @@ -3027,11 +4501,33 @@ jobs: id-token: 
write contents: read needs: manywheel-py3_13-rocm6_2_4-test +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.13" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_13-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_13-cuda12_9-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.2.4 GPU_ARCH_VERSION: 6.2.4 GPU_ARCH_TYPE: rocm @@ -3040,6 +4536,40 @@ jobs: use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-rocm6_2_4 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-cuda12_9 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13-cuda12_9-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.13" + build_name: 
manywheel-py3_13-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -3056,8 +4586,13 @@ jobs: DESIRED_CUDA: rocm6.3 GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -3070,7 +4605,11 @@ jobs: needs: - manywheel-py3_13-rocm6_3-build - get-label-type +<<<<<<< HEAD runs-on: linux.rocm.gpu +======= + runs-on: linux.rocm.gpu.mi250 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 240 env: PYTORCH_ROOT: /pytorch @@ -3081,8 +4620,13 @@ jobs: GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" steps: @@ -3107,12 +4651,40 @@ jobs: - name: ROCm set GPU_FLAG run: | echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" +<<<<<<< HEAD - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: pytorch/manylinux2_28-builder:rocm6.3-2.7 - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary +======= + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.3 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Teardown ROCm uses: ./.github/actions/teardown-rocm manywheel-py3_13-rocm6_3-upload: # Uploading @@ -3129,8 +4701,13 @@ jobs: DESIRED_CUDA: rocm6.3 GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + 
DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-rocm6_3 @@ -3138,6 +4715,122 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD +======= + manywheel-py3_13-rocm6_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.13" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_13-rocm6_4 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13-rocm6_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_13-rocm6_4-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.13" + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: manywheel-py3_13-rocm6_4 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.4 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + manywheel-py3_13-rocm6_4-upload: # Uploading + if: ${{ 
github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13-rocm6_4-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-rocm6_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_13-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -3149,14 +4842,23 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-xpu build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-xpu-test: # Testing @@ -3174,8 +4876,13 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" permissions: @@ -3186,7 +4893,11 @@ jobs: uses: ./.github/actions/setup-xpu - name: configure aws credentials id: aws_creds +<<<<<<< HEAD uses: 
aws-actions/configure-aws-credentials@v1.7.0 +======= + uses: aws-actions/configure-aws-credentials@v4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only aws-region: us-east-1 @@ -3209,12 +4920,32 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +<<<<<<< HEAD - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: pytorch/manylinux2_28-builder:xpu-2.7 - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary +======= + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: xpu + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Teardown XPU uses: ./.github/actions/teardown-xpu manywheel-py3_13-xpu-upload: # Uploading @@ -3230,8 +4961,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-xpu @@ -3250,8 +4986,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -3272,8 +5013,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cpu @@ -3295,8 +5041,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cpu @@ -3304,6 +5055,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13t-cpu-cxx11-abi-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -3438,6 +5190,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_13t-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -3450,14 +5204,23 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda12_6 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_6-test: # Testing @@ -3474,14 +5237,23 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.4xlarge.nvidia.gpu +======= + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_6-upload: # Uploading @@ -3498,8 +5270,13 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda12_6 @@ -3519,14 +5296,23 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda12_8 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_8-test: # Testing @@ -3543,14 +5329,23 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_8-upload: # Uploading @@ -3567,8 +5362,13 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False 
DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda12_8 @@ -3576,7 +5376,11 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13t-rocm6_2_4-build: +======= + manywheel-py3_13t-cuda12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -3585,6 +5389,7 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.2.4 GPU_ARCH_VERSION: 6.2.4 GPU_ARCH_TYPE: rocm @@ -3653,11 +5458,33 @@ jobs: id-token: write contents: read needs: manywheel-py3_13t-rocm6_2_4-test +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.13t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_13t-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_13t-cuda12_9-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.2.4 GPU_ARCH_VERSION: 6.2.4 GPU_ARCH_TYPE: rocm @@ -3666,6 +5493,40 @@ jobs: use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-rocm6_2_4 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cuda12_9 + 
build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13t-cuda12_9-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -3682,8 +5543,13 @@ jobs: DESIRED_CUDA: rocm6.3 GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -3696,7 +5562,11 @@ jobs: needs: - manywheel-py3_13t-rocm6_3-build - get-label-type +<<<<<<< HEAD runs-on: linux.rocm.gpu +======= + runs-on: linux.rocm.gpu.mi250 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 240 env: PYTORCH_ROOT: /pytorch @@ -3707,8 +5577,13 @@ jobs: GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" steps: @@ -3733,12 +5608,40 @@ jobs: - name: ROCm set GPU_FLAG run: | echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" +<<<<<<< HEAD - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: pytorch/manylinux2_28-builder:rocm6.3-2.7 - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary +======= + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.3 + docker-build-dir: .ci/docker + working-directory: pytorch + - 
name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Teardown ROCm uses: ./.github/actions/teardown-rocm manywheel-py3_13t-rocm6_3-upload: # Uploading @@ -3755,8 +5658,13 @@ jobs: DESIRED_CUDA: rocm6.3 GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-rocm6_3 @@ -3764,6 +5672,122 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD +======= + manywheel-py3_13t-rocm6_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.13t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_13t-rocm6_4 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-rocm6_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_13t-rocm6_4-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.13t" + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: manywheel-py3_13t-rocm6_4 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + 
uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.4 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + manywheel-py3_13t-rocm6_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13t-rocm6_4-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-rocm6_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_13t-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -3775,14 +5799,23 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-xpu build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | 
umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-xpu-test: # Testing @@ -3800,8 +5833,13 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" permissions: @@ -3812,7 +5850,11 @@ jobs: uses: ./.github/actions/setup-xpu - name: configure aws credentials id: aws_creds +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v1.7.0 +======= + uses: aws-actions/configure-aws-credentials@v4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only aws-region: us-east-1 @@ -3835,12 +5877,32 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +<<<<<<< HEAD - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: pytorch/manylinux2_28-builder:xpu-2.7 - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary +======= + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: xpu + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Teardown XPU uses: ./.github/actions/teardown-xpu manywheel-py3_13t-xpu-upload: # Uploading @@ -3856,8 +5918,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-xpu diff --git a/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml index 4d97845dd9fe..a6e37eaad62e 100644 --- a/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml @@ -18,6 +18,12 @@ on: - 'ciflow/binaries_wheel/*' workflow_dispatch: +<<<<<<< HEAD +======= +permissions: + id-token: write + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast 
kernel for mixed dtypes with float/bfloat16/half (#2791)) env: # Needed for conda builds ALPINE_IMAGE: "docker.io/s390x/alpine" @@ -38,7 +44,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -55,7 +65,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" runs_on: linux.s390x @@ -78,7 +93,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cpu-s390x @@ -100,7 +120,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cpu-s390x @@ -119,7 +144,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" runs_on: linux.s390x @@ -142,7 +172,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cpu-s390x @@ -164,7 +199,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cpu-s390x @@ -183,7 +223,12 @@ jobs: # favor of GPU_ARCH_VERSION 
DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" runs_on: linux.s390x @@ -206,7 +251,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu-s390x @@ -228,7 +278,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu-s390x @@ -247,7 +302,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" runs_on: linux.s390x @@ -270,7 +330,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu-s390x @@ -292,7 +357,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu-s390x @@ -311,7 +381,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" runs_on: linux.s390x @@ -334,7 +409,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise 
broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu-s390x @@ -356,7 +436,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu-s390x diff --git a/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml b/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml new file mode 100644 index 000000000000..9f1976c6de04 --- /dev/null +++ b/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml @@ -0,0 +1,136 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: macos-arm64-binary-libtorch-release + +on: +# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_libtorch/*' + workflow_dispatch: + +env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: macos-arm64-binary-libtorch-release + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SKIP_ALL_TESTS: 0 +concurrency: + group: macos-arm64-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + libtorch-cpu-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-14-xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.9" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + if [ -d "/Applications/Xcode_14.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + elif [ -d "/Applications/Xcode_13.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + fi + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + set -eux -o pipefail + # shellcheck disable=SC1090 + source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" + mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" + + # Build + USE_PYTORCH_METAL_EXPORT=1 + USE_COREML_DELEGATE=1 + TORCH_PACKAGE_NAME="${TORCH_PACKAGE_NAME//-/_}" + export USE_PYTORCH_METAL_EXPORT + export USE_COREML_DELEGATE + export TORCH_PACKAGE_NAME + "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: libtorch-cpu-shared-with-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + libtorch-cpu-shared-with-deps-release-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-cpu-shared-with-deps-release-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cpu + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-cpu-shared-with-deps-release + use_s3: False + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml index d5eb16b786eb..79d110248479 100644 --- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml @@ -145,7 +145,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 +======= + DOCKER_IMAGE: manylinux2_28-builder + 
DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.9" build_name: wheel-py3_9-cpu use_s3: False @@ -267,7 +272,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: wheel-py3_10-cpu use_s3: False @@ -389,7 +399,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: wheel-py3_11-cpu use_s3: False @@ -511,7 +526,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: wheel-py3_12-cpu use_s3: False @@ -633,7 +653,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: wheel-py3_13-cpu use_s3: False @@ -755,7 +780,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" build_name: wheel-py3_13t-cpu use_s3: False diff --git a/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml b/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml index 1c9888286ab1..7aa7fa626216 100644 --- a/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml +++ b/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml @@ -1,11 +1,19 @@ # @generated DO NOT EDIT MANUALLY +<<<<<<< HEAD # Template is at: .github/templates/windows_arm64_binary_build_workflow.yml.j2 +======= +# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Generation script: .github/scripts/generate_ci_workflows.py name: windows-arm64-binary-libtorch-debug on: push: +<<<<<<< HEAD +======= + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) branches: - nightly tags: @@ -17,24 +25,44 @@ on: workflow_dispatch: env: +<<<<<<< HEAD 
+======= + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + AWS_DEFAULT_REGION: us-east-1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) BUILD_ENVIRONMENT: windows-arm64-binary-libtorch-debug GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} PR_NUMBER: ${{ github.event.pull_request.number }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} SKIP_ALL_TESTS: 1 +<<<<<<< HEAD +======= + OS: windows-arm64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) PYTORCH_ROOT: /pytorch DOWNLOADS_DIR: c:\temp\downloads DEPENDENCIES_DIR: c:\temp\dependencies ENABLE_APL: 1 ENABLE_OPENBLAS: 0 MSVC_VERSION : 14.42 +<<<<<<< HEAD AWS_DEFAULT_REGION: us-east-1 +======= +concurrency: + group: windows-arm64-binary-libtorch-debug-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -43,8 +71,13 @@ jobs: libtorch-cpu-shared-with-deps-debug-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "windows-11-arm64" timeout-minutes: 240 +======= + runs-on: "windows-11-arm64-preview" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -59,9 +92,12 @@ jobs: # without this value pip does not get installed for some reason DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
+======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: cmd run: | @@ -73,10 +109,19 @@ jobs: run: | mkdir "%NIGHTLIES_PYTORCH_ROOT%" mkdir "%PYTORCH_FINAL_PACKAGE_DIR%" +<<<<<<< HEAD +======= + - name: Enable long paths + shell: cmd + run: | + git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now" + git config --system core.longpaths true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Git checkout PyTorch uses: actions/checkout@v4 with: path: "pytorch" +<<<<<<< HEAD - name: Bootstrap Build Tools shell: cmd run: | @@ -93,6 +138,8 @@ jobs: uses: actions/checkout@v4 with: path: "pytorch" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive - name: Bootstrap Python shell: cmd @@ -117,11 +164,19 @@ jobs: - name: Populate binary env shell: bash run: | +<<<<<<< HEAD "pytorch/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary shell: bash run: | "pytorch/.circleci/scripts/binary_windows_arm64_build.sh" +======= + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - uses: actions/upload-artifact@v4.4.0 if: always() with: @@ -134,8 +189,13 @@ jobs: needs: - libtorch-cpu-shared-with-deps-debug-build - get-label-type +<<<<<<< HEAD runs-on: "windows-11-arm64" timeout-minutes: 240 +======= + runs-on: "windows-11-arm64-preview" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -150,15 +210,19 @@ jobs: # without this value pip does not get installed for some reason DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
+======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: cmd run: | echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV% echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV% echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV% +<<<<<<< HEAD - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: @@ -176,6 +240,13 @@ jobs: shell: cmd run: | rmdir /s /q "pytorch" +======= + - name: Enable long paths + shell: cmd + run: | + git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now" + git config --system core.longpaths true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Git checkout PyTorch uses: actions/checkout@v4 with: @@ -189,14 +260,18 @@ jobs: shell: cmd run: | "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat" +<<<<<<< HEAD - name: Bootstrap Build Tools shell: cmd run: | "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Bootstrap Rust shell: cmd run: | "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat" +<<<<<<< HEAD - name: Populate binary env shell: bash run: | @@ -205,6 +280,21 @@ jobs: shell: bash run: | "pytorch/.circleci/scripts/binary_windows_arm64_test.sh" +======= + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) libtorch-cpu-shared-with-deps-debug-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: diff --git a/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml b/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml index 68600ac7ab9c..0ac01b390fee 100644 --- a/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml @@ -1,11 +1,19 @@ # @generated DO NOT EDIT MANUALLY +<<<<<<< HEAD # Template is at: .github/templates/windows_arm64_binary_build_workflow.yml.j2 +======= +# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Generation script: .github/scripts/generate_ci_workflows.py name: windows-arm64-binary-libtorch-release on: push: +<<<<<<< HEAD +======= + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) branches: - nightly tags: @@ -17,24 +25,44 @@ on: workflow_dispatch: env: +<<<<<<< HEAD +======= + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + AWS_DEFAULT_REGION: us-east-1 +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) BUILD_ENVIRONMENT: windows-arm64-binary-libtorch-release GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} PR_NUMBER: ${{ github.event.pull_request.number }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} SKIP_ALL_TESTS: 1 +<<<<<<< HEAD +======= + OS: windows-arm64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) PYTORCH_ROOT: /pytorch DOWNLOADS_DIR: c:\temp\downloads DEPENDENCIES_DIR: c:\temp\dependencies ENABLE_APL: 1 ENABLE_OPENBLAS: 0 MSVC_VERSION : 14.42 +<<<<<<< HEAD AWS_DEFAULT_REGION: us-east-1 +======= +concurrency: + group: windows-arm64-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -43,8 +71,13 @@ jobs: libtorch-cpu-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "windows-11-arm64" timeout-minutes: 240 +======= + runs-on: "windows-11-arm64-preview" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -59,9 +92,12 @@ jobs: # without this value pip does not get installed for some reason DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
+======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: cmd run: | @@ -73,10 +109,19 @@ jobs: run: | mkdir "%NIGHTLIES_PYTORCH_ROOT%" mkdir "%PYTORCH_FINAL_PACKAGE_DIR%" +<<<<<<< HEAD +======= + - name: Enable long paths + shell: cmd + run: | + git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now" + git config --system core.longpaths true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Git checkout PyTorch uses: actions/checkout@v4 with: path: "pytorch" +<<<<<<< HEAD - name: Bootstrap Build Tools shell: cmd run: | @@ -93,6 +138,8 @@ jobs: uses: actions/checkout@v4 with: path: "pytorch" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive - name: Bootstrap Python shell: cmd @@ -117,11 +164,19 @@ jobs: - name: Populate binary env shell: bash run: | +<<<<<<< HEAD "pytorch/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary shell: bash run: | "pytorch/.circleci/scripts/binary_windows_arm64_build.sh" +======= + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - uses: actions/upload-artifact@v4.4.0 if: always() with: @@ -134,8 +189,13 @@ jobs: needs: - libtorch-cpu-shared-with-deps-release-build - get-label-type +<<<<<<< HEAD runs-on: "windows-11-arm64" timeout-minutes: 240 +======= + runs-on: "windows-11-arm64-preview" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -150,15 +210,19 @@ jobs: # without this value pip does not get installed for some reason DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
+======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: cmd run: | echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV% echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV% echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV% +<<<<<<< HEAD - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: @@ -176,6 +240,13 @@ jobs: shell: cmd run: | rmdir /s /q "pytorch" +======= + - name: Enable long paths + shell: cmd + run: | + git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now" + git config --system core.longpaths true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Git checkout PyTorch uses: actions/checkout@v4 with: @@ -189,14 +260,18 @@ jobs: shell: cmd run: | "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat" +<<<<<<< HEAD - name: Bootstrap Build Tools shell: cmd run: | "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Bootstrap Rust shell: cmd run: | "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat" +<<<<<<< HEAD - name: Populate binary env shell: bash run: | @@ -205,6 +280,21 @@ jobs: shell: bash run: | "pytorch/.circleci/scripts/binary_windows_arm64_test.sh" +======= + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) libtorch-cpu-shared-with-deps-release-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: diff --git a/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml index 1b6373276f5e..30fef7f4e2e6 100644 --- a/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml +++ b/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml @@ -1,11 +1,19 @@ # @generated DO NOT EDIT MANUALLY +<<<<<<< HEAD # Template is at: .github/templates/windows_arm64_binary_build_workflow.yml.j2 +======= +# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Generation script: .github/scripts/generate_ci_workflows.py name: windows-arm64-binary-wheel on: push: +<<<<<<< HEAD +======= + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) branches: - nightly tags: @@ -17,34 +25,62 @@ on: workflow_dispatch: env: +<<<<<<< HEAD +======= + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + AWS_DEFAULT_REGION: us-east-1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed 
dtypes with float/bfloat16/half (#2791)) BUILD_ENVIRONMENT: windows-arm64-binary-wheel GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} PR_NUMBER: ${{ github.event.pull_request.number }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} SKIP_ALL_TESTS: 1 +<<<<<<< HEAD +======= + OS: windows-arm64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) PYTORCH_ROOT: /pytorch DOWNLOADS_DIR: c:\temp\downloads DEPENDENCIES_DIR: c:\temp\dependencies ENABLE_APL: 1 ENABLE_OPENBLAS: 0 MSVC_VERSION : 14.42 +<<<<<<< HEAD AWS_DEFAULT_REGION: us-east-1 +======= +concurrency: + group: windows-arm64-binary-wheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD wheel-py3_12-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64" timeout-minutes: 240 +======= + wheel-py3_11-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "windows-11-arm64-preview" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -53,11 +89,16 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 +<<<<<<< HEAD DESIRED_PYTHON: "3.12" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
+======= + DESIRED_PYTHON: "3.11" + steps: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: cmd run: | @@ -69,10 +110,19 @@ jobs: run: | mkdir "%NIGHTLIES_PYTORCH_ROOT%" mkdir "%PYTORCH_FINAL_PACKAGE_DIR%" +<<<<<<< HEAD +======= + - name: Enable long paths + shell: cmd + run: | + git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now" + git config --system core.longpaths true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Git checkout PyTorch uses: actions/checkout@v4 with: path: "pytorch" +<<<<<<< HEAD - name: Bootstrap Build Tools shell: cmd run: | @@ -89,6 +139,8 @@ jobs: uses: actions/checkout@v4 with: path: "pytorch" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive - name: Bootstrap Python shell: cmd @@ -113,6 +165,7 @@ jobs: - name: Populate binary env shell: bash run: | +<<<<<<< HEAD "pytorch/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary shell: bash @@ -132,6 +185,27 @@ jobs: - get-label-type runs-on: "windows-11-arm64" timeout-minutes: 240 +======= + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_11-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + wheel-py3_11-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_11-cpu-build + - get-label-type + runs-on: "windows-11-arm64-preview" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -140,17 +214,23 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 +<<<<<<< HEAD DESIRED_PYTHON: "3.12" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
+======= + DESIRED_PYTHON: "3.11" + steps: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: cmd run: | echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV% echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV% echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV% +<<<<<<< HEAD - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: @@ -168,6 +248,13 @@ jobs: shell: cmd run: | rmdir /s /q "pytorch" +======= + - name: Enable long paths + shell: cmd + run: | + git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now" + git config --system core.longpaths true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Git checkout PyTorch uses: actions/checkout@v4 with: @@ -181,14 +268,18 @@ jobs: shell: cmd run: | "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat" +<<<<<<< HEAD - name: Bootstrap Build Tools shell: cmd run: | "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Bootstrap Rust shell: cmd run: | "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat" +<<<<<<< HEAD - name: Populate binary env shell: bash run: | @@ -197,6 +288,168 @@ jobs: shell: bash run: | "pytorch/.circleci/scripts/binary_windows_arm64_test.sh" +======= + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_11-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + wheel-py3_11-cpu-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_11-cpu-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.11" + build_name: wheel-py3_11-cpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_12-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "windows-11-arm64-preview" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.12" + steps: + - name: Populate binary env + shell: cmd + run: | + echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV% + echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV% + echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV% + - name: Bootstrap folders + shell: cmd + run: | + mkdir "%NIGHTLIES_PYTORCH_ROOT%" + mkdir "%PYTORCH_FINAL_PACKAGE_DIR%" + - name: Enable long paths + shell: cmd + run: | + git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now" + git config --system core.longpaths true + - name: Git checkout PyTorch + uses: actions/checkout@v4 
+ with: + path: "pytorch" + submodules: recursive + - name: Bootstrap Python + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat" + - name: Bootstrap APL + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat" + - name: Bootstrap Rust + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat" + - name: Bootstrap sccache + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_sccache.bat" + - name: Bootstrap Libuv + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_libuv.bat" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_12-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + wheel-py3_12-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_12-cpu-build + - get-label-type + runs-on: "windows-11-arm64-preview" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.12" + steps: + - name: Populate binary env + shell: cmd + run: | + echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV% + echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV% + echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV% + - name: Enable long paths + shell: cmd + run: | + git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now" + git config --system core.longpaths true + - name: Git checkout PyTorch + uses: actions/checkout@v4 + with: + path: "pytorch" + submodules: recursive + - name: Bootstrap APL + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat" + - name: Bootstrap Python + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat" + - name: Bootstrap Rust + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_12-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) wheel-py3_12-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: @@ -215,3 +468,153 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD +======= + wheel-py3_13-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "windows-11-arm64-preview" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.13" + steps: + - name: Populate binary env + 
shell: cmd + run: | + echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV% + echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV% + echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV% + - name: Bootstrap folders + shell: cmd + run: | + mkdir "%NIGHTLIES_PYTORCH_ROOT%" + mkdir "%PYTORCH_FINAL_PACKAGE_DIR%" + - name: Enable long paths + shell: cmd + run: | + git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now" + git config --system core.longpaths true + - name: Git checkout PyTorch + uses: actions/checkout@v4 + with: + path: "pytorch" + submodules: recursive + - name: Bootstrap Python + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat" + - name: Bootstrap APL + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat" + - name: Bootstrap Rust + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat" + - name: Bootstrap sccache + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_sccache.bat" + - name: Bootstrap Libuv + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_libuv.bat" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_13-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + wheel-py3_13-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_13-cpu-build + - get-label-type + runs-on: "windows-11-arm64-preview" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.13" + steps: + - name: Populate binary env + shell: cmd + run: | + echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV% + echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV% + echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV% + - name: Enable long paths + shell: cmd + run: | + git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now" + git config --system core.longpaths true + - name: Git checkout PyTorch + uses: actions/checkout@v4 + with: + path: "pytorch" + submodules: recursive + - name: Bootstrap APL + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat" + - name: Bootstrap Python + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat" + - name: Bootstrap Rust + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_13-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + wheel-py3_13-cpu-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_13-cpu-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy 
variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.13" + build_name: wheel-py3_13-cpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-main.yml b/.github/workflows/generated-windows-binary-libtorch-debug-main.yml index 98accb3deec9..54f0687ab6ff 100644 --- a/.github/workflows/generated-windows-binary-libtorch-debug-main.yml +++ b/.github/workflows/generated-windows-binary-libtorch-debug-main.yml @@ -19,6 +19,10 @@ env: PR_NUMBER: ${{ github.event.pull_request.number }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} SKIP_ALL_TESTS: 1 +<<<<<<< HEAD +======= + OS: windows +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: windows-binary-libtorch-debug-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true @@ -27,7 +31,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -52,6 +60,18 @@ jobs: # without this value pip does not get installed for some reason DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -67,7 +87,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -96,6 +120,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -105,6 +130,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -144,6 +171,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) libtorch-cpu-shared-with-deps-debug-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -180,7 +211,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -209,6 +244,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
@@ -223,6 +272,7 @@ jobs: with: name: libtorch-cpu-shared-with-deps-debug path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -234,6 +284,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml index 5f02c2636e10..db1283d97318 100644 --- a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml +++ b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml @@ -26,6 +26,10 @@ env: PR_NUMBER: ${{ github.event.pull_request.number }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} SKIP_ALL_TESTS: 1 +<<<<<<< HEAD +======= + OS: windows +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: windows-binary-libtorch-debug-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true @@ -34,7 +38,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -59,6 +67,18 @@ jobs: # without this value pip does not get installed for some reason DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -74,7 +94,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -103,6 +127,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -112,6 +137,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -151,6 +178,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) libtorch-cpu-shared-with-deps-debug-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -187,7 +218,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -216,6 +251,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
@@ -230,6 +279,7 @@ jobs: with: name: libtorch-cpu-shared-with-deps-debug path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -241,6 +291,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -284,6 +336,7 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD libtorch-cuda11_8-shared-with-deps-debug-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type @@ -531,6 +584,8 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) libtorch-cuda12_6-shared-with-deps-debug-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type @@ -551,6 +606,18 @@ jobs: # without this value pip does not get installed for some reason DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -566,7 +633,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -595,6 +666,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
@@ -604,6 +676,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -643,6 +717,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) libtorch-cuda12_6-shared-with-deps-debug-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -680,7 +758,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -709,6 +791,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -723,6 +819,7 @@ jobs: with: name: libtorch-cuda12_6-shared-with-deps-debug path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -734,6 +831,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -798,6 +897,18 @@ jobs: # without this value pip does not get installed for some reason DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -813,7 +924,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -842,6 +957,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -851,6 +967,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -890,6 +1008,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) libtorch-cuda12_8-shared-with-deps-debug-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -927,7 +1049,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -956,6 +1082,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
@@ -970,6 +1110,7 @@ jobs: with: name: libtorch-cuda12_8-shared-with-deps-debug path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -981,6 +1122,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -1025,3 +1168,254 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD +======= + libtorch-cuda12_9-shared-with-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.9" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: libtorch-cuda12_9-shared-with-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + libtorch-cuda12_9-shared-with-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-cuda12_9-shared-with-deps-debug-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: 
$(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: libtorch-cuda12_9-shared-with-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda12_9-shared-with-deps-debug-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-cuda12_9-shared-with-deps-debug-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.9" + build_name: libtorch-cuda12_9-shared-with-deps-debug + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/generated-windows-binary-libtorch-release-main.yml b/.github/workflows/generated-windows-binary-libtorch-release-main.yml index dd8c039761ae..5eda1e8b89cd 100644 --- a/.github/workflows/generated-windows-binary-libtorch-release-main.yml +++ b/.github/workflows/generated-windows-binary-libtorch-release-main.yml @@ -19,6 +19,10 @@ env: PR_NUMBER: ${{ github.event.pull_request.number }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} SKIP_ALL_TESTS: 1 +<<<<<<< HEAD +======= + OS: windows +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: windows-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true @@ -27,7 +31,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -52,6 +60,18 @@ jobs: # without this value pip does not get installed 
for some reason DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -67,7 +87,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -96,6 +120,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -105,6 +130,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -144,6 +171,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) libtorch-cpu-shared-with-deps-release-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -180,7 +211,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -209,6 +244,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast 
kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -223,6 +272,7 @@ jobs: with: name: libtorch-cpu-shared-with-deps-release path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -234,6 +284,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | diff --git a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml index 69f16fbaf95b..77b5789c1f82 100644 --- a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml @@ -26,6 +26,10 @@ env: PR_NUMBER: ${{ github.event.pull_request.number }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} SKIP_ALL_TESTS: 1 +<<<<<<< HEAD +======= + OS: windows +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: windows-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true @@ -34,7 +38,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -59,6 +67,18 @@ jobs: # without this value pip does not get installed for some reason DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
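The concurrency block shown in these workflow headers groups runs by PR number (or branch name for pushes) so that a newer run cancels any in-progress run for the same key. A trimmed-down sketch of the same pattern, with a hypothetical group prefix:

name: concurrency-sketch
on: [pull_request, push, workflow_dispatch]
concurrency:
  # One group per PR or branch; the workflow_dispatch term keeps manual runs in their own group
  group: windows-binary-sketch-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true
jobs:
  noop:
    runs-on: ubuntu-latest
    steps:
      - run: echo "grouped by ${{ github.ref_name }}"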
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -74,7 +94,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -103,6 +127,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -112,6 +137,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -151,6 +178,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) libtorch-cpu-shared-with-deps-release-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -187,7 +218,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -216,6 +251,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
@@ -230,6 +279,7 @@ jobs: with: name: libtorch-cpu-shared-with-deps-release path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -241,6 +291,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -284,6 +336,7 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD libtorch-cuda11_8-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type @@ -531,6 +584,8 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) libtorch-cuda12_6-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type @@ -551,6 +606,18 @@ jobs: # without this value pip does not get installed for some reason DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -566,7 +633,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -595,6 +666,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
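The *-upload jobs above do not define their own steps; they call the repository's reusable workflow ./.github/workflows/_binary-upload.yml, forwarding build metadata through with: and the token through secrets:. A minimal sketch of that caller shape, assuming a hypothetical callee workflow and input (the real _binary-upload.yml inputs are the ones visible in the diff, not reproduced here):

name: reusable-upload-sketch
on: workflow_dispatch
jobs:
  demo-upload:
    permissions:
      id-token: write    # needed when the callee authenticates via OIDC
      contents: read
    uses: ./.github/workflows/_demo-upload.yml   # hypothetical reusable workflow
    with:
      build_name: demo-build                     # hypothetical input declared by the callee
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}  # callee must declare this secret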
@@ -604,6 +676,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -643,6 +717,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) libtorch-cuda12_6-shared-with-deps-release-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -680,7 +758,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -709,6 +791,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -723,6 +819,7 @@ jobs: with: name: libtorch-cuda12_6-shared-with-deps-release path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -734,6 +831,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -798,6 +897,18 @@ jobs: # without this value pip does not get installed for some reason DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -813,7 +924,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -842,6 +957,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -851,6 +967,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -890,6 +1008,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) libtorch-cuda12_8-shared-with-deps-release-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -927,7 +1049,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -956,6 +1082,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
@@ -970,6 +1110,7 @@ jobs: with: name: libtorch-cuda12_8-shared-with-deps-release path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -981,6 +1122,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -1025,3 +1168,254 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD +======= + libtorch-cuda12_9-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.9" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: libtorch-cuda12_9-shared-with-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + libtorch-cuda12_9-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-cuda12_9-shared-with-deps-release-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: 
$(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: libtorch-cuda12_9-shared-with-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda12_9-shared-with-deps-release-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-cuda12_9-shared-with-deps-release-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.9" + build_name: libtorch-cuda12_9-shared-with-deps-release + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/generated-windows-binary-wheel-nightly.yml b/.github/workflows/generated-windows-binary-wheel-nightly.yml index ca9fed87cabb..bb8cbcdf8b62 100644 --- a/.github/workflows/generated-windows-binary-wheel-nightly.yml +++ b/.github/workflows/generated-windows-binary-wheel-nightly.yml @@ -26,6 +26,10 @@ env: PR_NUMBER: ${{ github.event.pull_request.number }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} SKIP_ALL_TESTS: 1 +<<<<<<< HEAD +======= + OS: windows +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: windows-binary-wheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true @@ -34,7 +38,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -55,6 +63,18 @@ jobs: SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD +======= + # NOTE: 
These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -70,7 +90,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -99,6 +123,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -108,6 +133,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -147,6 +174,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) wheel-py3_9-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -179,7 +210,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -208,6 +243,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are 
put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -222,6 +271,7 @@ jobs: with: name: wheel-py3_9-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -233,6 +283,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -272,6 +324,7 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD wheel-py3_9-cuda11_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type @@ -507,6 +560,8 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) wheel-py3_9-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type @@ -523,6 +578,18 @@ jobs: SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -538,7 +605,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -567,6 +638,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
@@ -576,6 +648,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -615,6 +689,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) wheel-py3_9-cuda12_6-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -648,7 +726,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -677,6 +759,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -691,6 +787,7 @@ jobs: with: name: wheel-py3_9-cuda12_6 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -702,6 +799,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -758,6 +857,18 @@ jobs: SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
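The "Clean PyTorch checkout" step in the hunk above leans entirely on git clean -fxd: -f forces the removal, -x also deletes files that are normally ignored (build outputs), and -d recurses into untracked directories, so a reused checkout is reset to tracked content only. A small sketch of that behaviour on a throwaway repository; the repository and the names tracked.txt, build/libtorch.dll and stray.log are made up purely for illustration:

    #!/usr/bin/env bash
    # Demonstrate what git clean -fxd removes, using a scratch repository.
    set -euo pipefail
    repo="$(mktemp -d)"
    git -C "$repo" init -q
    touch "$repo/tracked.txt"
    git -C "$repo" add tracked.txt
    git -C "$repo" -c user.email=ci@example.com -c user.name=ci commit -qm "init"

    # Leftovers from a hypothetical previous run: an ignored build tree and an untracked log.
    echo "build/" > "$repo/.gitignore"
    mkdir -p "$repo/build" && touch "$repo/build/libtorch.dll" "$repo/stray.log"

    git -C "$repo" clean -nxd          # -n: dry run, list what would be deleted
    git -C "$repo" clean -fxd          # what the workflow runs (force, ignored files, directories)
    git -C "$repo" status --short      # nothing left except tracked content

Running the dry-run form first is a cheap way to confirm what the flags will touch before pointing them at a checkout you care about.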
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -773,7 +884,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -802,6 +917,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -811,6 +927,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -850,6 +968,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) wheel-py3_9-cuda12_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -883,7 +1005,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -912,6 +1038,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
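The "Display EC2 information" steps in these jobs call a get_ec2_metadata helper that queries the EC2 instance-metadata service using the IMDSv2 flow: a short-lived session token is requested with a PUT, then that token is sent as a header on the actual metadata GET. A stand-alone version of that helper is sketched below; it mirrors the function defined in these workflow steps, keeps the same 30-second token TTL, and only works when run from inside an EC2 instance:

    #!/usr/bin/env bash
    # IMDSv2 metadata lookup, mirroring the get_ec2_metadata helper used in these steps.
    # 169.254.169.254 is the link-local metadata endpoint, reachable only on EC2.
    set -euo pipefail

    get_ec2_metadata() {
      local category=$1
      local token
      token=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" \
                -H "X-aws-ec2-metadata-token-ttl-seconds: 30")
      curl -fsSL -H "X-aws-ec2-metadata-token: ${token}" \
        "http://169.254.169.254/latest/meta-data/${category}"
    }

    echo "ami-id:        $(get_ec2_metadata ami-id)"
    echo "instance-id:   $(get_ec2_metadata instance-id)"
    echo "instance-type: $(get_ec2_metadata instance-type)"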
@@ -926,6 +1066,7 @@ jobs: with: name: wheel-py3_9-cuda12_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -937,6 +1078,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -977,7 +1120,11 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD wheel-py3_9-xpu-build: +======= + wheel-py3_9-cuda12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -987,12 +1134,30 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 steps: +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -1008,7 +1173,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1037,15 +1206,8 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +<<<<<<< HEAD +======= - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -1068,7 +1230,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_9-xpu + name: wheel-py3_9-cuda12_9 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1085,20 +1247,22 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-xpu-test: # Testing + + wheel-py3_9-cuda12_9-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_9-xpu-build + - wheel-py3_9-cuda12_9-build - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" steps: @@ -1117,7 +1281,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1146,6 +1310,18 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
@@ -1155,22 +1331,13 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +<<<<<<< HEAD +======= - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_9-xpu + name: wheel-py3_9-cuda12_9 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Checkout PyTorch - uses: actions/checkout@v4 - with: - submodules: recursive - path: pytorch - show-progress: false - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - name: Populate binary env shell: bash run: | @@ -1192,25 +1359,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-xpu-upload: # Uploading + wheel-py3_9-cuda12_9-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_9-xpu-test + needs: wheel-py3_9-cuda12_9-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.9" - build_name: wheel-py3_9-xpu + build_name: wheel-py3_9-cuda12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cpu-build: + wheel-py3_9-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -1220,11 +1388,21 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.9" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - name: Display EC2 information shell: bash run: | @@ -1240,7 +1418,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1269,15 +1447,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -1300,7 +1470,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_10-cpu + name: wheel-py3_9-xpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1317,10 +1487,14 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cpu-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_9-xpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_10-cpu-build + - wheel-py3_9-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" timeout-minutes: 300 @@ -1329,10 +1503,10 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.9" steps: - name: Display EC2 information shell: bash @@ -1349,7 +1523,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1378,6 +1556,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + 
- name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -1390,8 +1582,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_10-cpu + name: wheel-py3_9-xpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -1403,6 +1596,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -1424,25 +1619,25 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cpu-upload: # Uploading + wheel-py3_9-xpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_10-cpu-test + needs: wheel-py3_9-xpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cpu + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + DESIRED_PYTHON: "3.9" + build_name: wheel-py3_9-xpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cuda11_8-build: + wheel-py3_10-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -1452,12 +1647,23 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -1473,7 +1679,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1502,6 +1712,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -1511,6 +1722,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -1533,7 +1746,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_10-cuda11_8 + name: wheel-py3_10-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1550,21 +1763,24 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda11_8-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_10-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_10-cuda11_8-build + - wheel-py3_10-cpu-build - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" timeout-minutes: 300 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" steps: @@ -1583,7 +1799,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1612,6 +1832,20 @@ 
jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -1624,8 +1858,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_10-cuda11_8 + name: wheel-py3_10-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -1637,6 +1872,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -1658,26 +1895,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda11_8-upload: # Uploading + wheel-py3_10-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_10-cuda11_8-test + needs: wheel-py3_10-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cuda11_8 + build_name: wheel-py3_10-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cuda12_6-build: +<<<<<<< HEAD + wheel-py3_10-cuda11_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -1687,8 +1924,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" @@ -1768,7 +2005,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_10-cuda12_6 + name: wheel-py3_10-cuda11_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1785,10 +2022,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda12_6-test: # Testing + wheel-py3_10-cuda11_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_10-cuda12_6-build + - wheel-py3_10-cuda11_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 @@ -1797,8 +2034,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - 
DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" @@ -1859,7 +2096,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_10-cuda12_6 + name: wheel-py3_10-cuda11_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -1893,26 +2130,28 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda12_6-upload: # Uploading + wheel-py3_10-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_10-cuda12_6-test + needs: wheel-py3_10-cuda11_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cuda12_6 + build_name: wheel-py3_10-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cuda12_8-build: +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_10-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -1922,12 +2161,24 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -1943,7 +2194,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1972,6 +2227,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -1981,6 +2237,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -2003,7 +2261,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_10-cuda12_8 + name: wheel-py3_10-cuda12_6 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2020,10 +2278,14 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda12_8-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_10-cuda12_6-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_10-cuda12_8-build + - wheel-py3_10-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 @@ -2032,8 +2294,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" @@ -2053,7 +2315,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2082,6 +2348,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be 
sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -2094,8 +2374,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_10-cuda12_8 + name: wheel-py3_10-cuda12_6 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -2107,6 +2388,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -2128,26 +2411,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda12_8-upload: # Uploading + wheel-py3_10-cuda12_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_10-cuda12_8-test + needs: wheel-py3_10-cuda12_6-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cuda12_8 + build_name: wheel-py3_10-cuda12_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-xpu-build: + wheel-py3_10-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -2157,12 +2440,24 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -2178,7 +2473,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2207,6 +2506,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -2216,6 +2516,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -2238,7 +2540,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_10-xpu + name: wheel-py3_10-cuda12_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2255,20 +2557,25 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-xpu-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_10-cuda12_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_10-xpu-build + - wheel-py3_10-cuda12_8-build - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" steps: @@ -2287,7 +2594,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2316,6 +2627,20 @@ 
jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -2328,8 +2653,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_10-xpu + name: wheel-py3_10-cuda12_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -2341,6 +2667,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -2362,25 +2690,30 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-xpu-upload: # Uploading + wheel-py3_10-cuda12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_10-xpu-test + needs: wheel-py3_10-cuda12_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-xpu + build_name: wheel-py3_10-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-cpu-build: +<<<<<<< HEAD + wheel-py3_10-xpu-build: +======= + wheel-py3_10-cuda12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -2390,11 +2723,30 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu +<<<<<<< HEAD + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.10" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 steps: +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: 
cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -2410,7 +2762,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2439,15 +2795,8 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +<<<<<<< HEAD +======= - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -2470,7 +2819,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_11-cpu + name: wheel-py3_10-cuda12_9 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2487,22 +2836,24 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cpu-test: # Testing + + wheel-py3_10-cuda12_9-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_11-cpu-build + - wheel-py3_10-cuda12_9-build - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -2519,7 +2870,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2548,6 +2899,18 @@ jobs: # Let's both 
exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -2557,22 +2920,1602 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +<<<<<<< HEAD +======= - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_11-cpu + name: wheel-py3_10-cuda12_9 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Checkout PyTorch - uses: actions/checkout@v4 - with: - submodules: recursive - path: pytorch - show-progress: false - - name: Clean PyTorch checkout + - name: Populate binary env + shell: bash run: | - # Remove any artifacts from the previous checkouts - git clean -fxd + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_10-cuda12_9-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.10" + build_name: wheel-py3_10-cuda12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_10-xpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | 
onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_10-xpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_10-xpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_10-xpu-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable 
fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
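
The NOTE above and the "Populate binary env" step that follows it rely on a runner mechanism worth spelling out: a step cannot set workflow-level env, but anything it appends to the file named by GITHUB_ENV becomes an environment variable for every later step of the same job, and only a running step can read runner.temp (exposed to shells as RUNNER_TEMP). A minimal Python sketch of that mechanism, purely illustrative since the workflow does it with the bash echo lines shown here:

    import os

    def export_to_later_steps(**pairs: str) -> None:
        """Append KEY=VALUE lines to the file named by GITHUB_ENV so that
        subsequent steps in the same job see them as environment variables."""
        with open(os.environ["GITHUB_ENV"], "a", encoding="utf-8") as fh:
            for key, value in pairs.items():
                fh.write(f"{key}={value}\n")

    if __name__ == "__main__":
        runner_temp = os.environ["RUNNER_TEMP"]  # only available inside a running step
        export_to_later_steps(
            BINARY_ENV_FILE=f"{runner_temp}/env",
            PYTORCH_FINAL_PACKAGE_DIR=f"{runner_temp}/artifacts",
            WIN_PACKAGE_WORK_DIR=runner_temp,
        )
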
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_10-xpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-xpu-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_10-xpu-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + DESIRED_PYTHON: "3.10" + build_name: wheel-py3_10-xpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_11-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.11" + steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
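
The upload-artifact, download-artifact and *-upload jobs above all address a build through one composed name (wheel-py3_10-xpu, wheel-py3_11-cpu, and so on). The convention appears to be package type, then the Python version with dots replaced by underscores, then the accelerator tag; the helper below is hypothetical, since this workflow is generated from templates rather than from such a function, but it shows the assumed pattern:

    def binary_build_name(package_type: str, python_version: str, accelerator: str) -> str:
        """Compose an artifact/build name such as 'wheel-py3_11-cpu'."""
        py_tag = "py" + python_version.replace(".", "_")
        return f"{package_type}-{py_tag}-{accelerator}"

    assert binary_build_name("wheel", "3.10", "xpu") == "wheel-py3_10-xpu"
    assert binary_build_name("wheel", "3.11", "cpu") == "wheel-py3_11-cpu"
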
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
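
The "Display EC2 information" steps above use IMDSv2: curl first PUTs to http://169.254.169.254/latest/api/token with a TTL header to obtain a short-lived session token, then presents that token in the X-aws-ec2-metadata-token header when reading a meta-data category. A rough Python equivalent of that shell function, assuming it runs on an EC2 instance where the metadata endpoint is reachable:

    import urllib.request

    IMDS = "http://169.254.169.254/latest"

    def get_ec2_metadata(category: str, ttl_seconds: int = 30) -> str:
        """Fetch one metadata category (e.g. 'ami-id', 'instance-type') via IMDSv2."""
        # Step 1: obtain a short-lived session token.
        token_req = urllib.request.Request(
            f"{IMDS}/api/token",
            method="PUT",
            headers={"X-aws-ec2-metadata-token-ttl-seconds": str(ttl_seconds)},
        )
        token = urllib.request.urlopen(token_req, timeout=5).read().decode()
        # Step 2: read the requested category, presenting the token.
        meta_req = urllib.request.Request(
            f"{IMDS}/meta-data/{category}",
            headers={"X-aws-ec2-metadata-token": token},
        )
        return urllib.request.urlopen(meta_req, timeout=5).read().decode()

    if __name__ == "__main__":
        for category in ("ami-id", "instance-id", "instance-type"):
            print(f"{category}: {get_ec2_metadata(category)}")
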
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_11-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_11-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_11-cpu-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.11" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_11-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_11-cpu-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_11-cpu-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: 
cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.11" + build_name: wheel-py3_11-cpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD + wheel-py3_11-cuda11_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.11" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
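
The "Enable long paths on Windows" steps above set LongPathsEnabled under HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem so deep checkout and build paths do not hit the legacy 260-character MAX_PATH limit (see the pytorch/pytorch#73339 comment linked in the workflow). The workflow does this with Set-ItemProperty in PowerShell; an equivalent sketch with Python's standard winreg module, which needs an elevated process on Windows, looks like this:

    import winreg

    def enable_long_paths() -> None:
        """Set HKLM\\SYSTEM\\CurrentControlSet\\Control\\FileSystem\\LongPathsEnabled to 1,
        mirroring the Set-ItemProperty step in the workflow. Requires admin rights."""
        key_path = r"SYSTEM\CurrentControlSet\Control\FileSystem"
        with winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, key_path, 0, winreg.KEY_SET_VALUE) as key:
            winreg.SetValueEx(key, "LongPathsEnabled", 0, winreg.REG_DWORD, 1)

    if __name__ == "__main__":
        enable_long_paths()
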
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_11-cuda11_8 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_11-cuda11_8-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_11-cuda11_8-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.11" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_11-cuda11_8 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_11-cuda11_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_11-cuda11_8-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.11" + build_name: wheel-py3_11-cuda11_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_11-cuda12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 
300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.11" + steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_11-cuda12_6 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_11-cuda12_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_11-cuda12_6-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.11" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata 
instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_11-cuda12_6 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_11-cuda12_6-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_11-cuda12_6-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.11" + build_name: wheel-py3_11-cuda12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_11-cuda12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.11" + steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
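
Every CUDA job above carries both DESIRED_CUDA (the legacy short tag such as cu126, cu128 or cu129) and GPU_ARCH_VERSION (the dotted version the TODO comments say should eventually replace it). The correspondence is mechanical; the helper below is only an illustration of the relationship implied by those names, not code from this repository:

    def desired_cuda_to_arch_version(desired_cuda: str) -> str:
        """Turn a legacy 'cuXYZ' tag into a dotted version, e.g. 'cu129' -> '12.9'.
        Assumes the tag is 'cu' + major digits + one minor digit, which holds for
        the cu118/cu126/cu128/cu129 tags used in this workflow."""
        if not desired_cuda.startswith("cu") or not desired_cuda[2:].isdigit():
            raise ValueError(f"not a CUDA tag: {desired_cuda!r}")
        digits = desired_cuda[2:]
        return f"{digits[:-1]}.{digits[-1]}"

    assert desired_cuda_to_arch_version("cu126") == "12.6"
    assert desired_cuda_to_arch_version("cu129") == "12.9"
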
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_11-cuda12_8 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_11-cuda12_8-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_11-cuda12_8-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.11" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_11-cuda12_8 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_11-cuda12_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_11-cuda12_8-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + 
DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.11" + build_name: wheel-py3_11-cuda12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD + wheel-py3_11-xpu-build: +======= + wheel-py3_11-cuda12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION +<<<<<<< HEAD + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.11" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 + steps: +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.11" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_11-cuda12_9 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + wheel-py3_11-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_11-cuda12_9-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.11" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: 
pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +<<<<<<< HEAD +======= + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_11-cuda12_9 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash run: | @@ -2594,25 +4537,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cpu-upload: # Uploading + wheel-py3_11-cuda12_9-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_11-cpu-test + needs: wheel-py3_11-cuda12_9-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-cpu + build_name: wheel-py3_11-cuda12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-cuda11_8-build: + wheel-py3_11-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -2622,12 +4566,21 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - name: Display EC2 information shell: bash run: | @@ -2643,7 +4596,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2672,15 +4625,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -2703,7 +4648,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_11-cuda11_8 + name: wheel-py3_11-xpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2720,21 +4665,24 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda11_8-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_11-xpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_11-cuda11_8-build + - wheel-py3_11-xpu-build - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" timeout-minutes: 300 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" steps: @@ -2753,7 +4701,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2782,6 +4734,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: 
actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -2794,8 +4760,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_11-cuda11_8 + name: wheel-py3_11-xpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -2807,6 +4774,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -2828,26 +4797,25 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda11_8-upload: # Uploading + wheel-py3_11-xpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_11-cuda11_8-test + needs: wheel-py3_11-xpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-cuda11_8 + build_name: wheel-py3_11-xpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-cuda12_6-build: + wheel-py3_12-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -2857,12 +4825,23 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.12" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -2878,7 +4857,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2907,6 +4890,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -2916,6 +4900,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -2938,7 +4924,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_11-cuda12_6 + name: wheel-py3_12-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2955,23 +4941,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda12_6-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_12-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_11-cuda12_6-build + - wheel-py3_12-cpu-build - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" timeout-minutes: 300 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.12" steps: - name: Display EC2 information shell: bash @@ -2988,7 +4977,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true 
with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3017,6 +5010,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -3029,8 +5036,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_11-cuda12_6 + name: wheel-py3_12-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -3042,6 +5050,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -3063,26 +5073,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda12_6-upload: # Uploading + wheel-py3_12-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_11-cuda12_6-test + needs: wheel-py3_12-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 - GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-cuda12_6 + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.12" + build_name: wheel-py3_12-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-cuda12_8-build: +<<<<<<< HEAD + wheel-py3_12-cuda11_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -3092,11 +5102,11 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.12" steps: - name: Display EC2 information shell: bash @@ -3173,7 +5183,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_11-cuda12_8 + name: wheel-py3_12-cuda11_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3190,10 +5200,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda12_8-test: # Testing + wheel-py3_12-cuda11_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_11-cuda12_8-build + - wheel-py3_12-cuda11_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" 
timeout-minutes: 300 @@ -3202,11 +5212,11 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.12" steps: - name: Display EC2 information shell: bash @@ -3264,7 +5274,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_11-cuda12_8 + name: wheel-py3_12-cuda11_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -3298,26 +5308,28 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda12_8-upload: # Uploading + wheel-py3_12-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_11-cuda12_8-test + needs: wheel-py3_12-cuda11_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-cuda12_8 + DESIRED_PYTHON: "3.12" + build_name: wheel-py3_12-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-xpu-build: +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_12-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -3327,12 +5339,24 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 + DESIRED_PYTHON: "3.12" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -3348,7 +5372,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3377,6 +5405,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -3386,6 +5415,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -3408,7 +5439,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_11-xpu + name: wheel-py3_12-cuda12_6 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3425,22 +5456,27 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-xpu-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_12-cuda12_6-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_11-xpu-build + - wheel-py3_12-cuda12_6-build - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.12" steps: - name: Display EC2 information shell: bash @@ -3457,7 +5493,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true 
with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3486,6 +5526,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -3498,8 +5552,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_11-xpu + name: wheel-py3_12-cuda12_6 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -3511,6 +5566,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -3532,25 +5589,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-xpu-upload: # Uploading + wheel-py3_12-cuda12_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_11-xpu-test + needs: wheel-py3_12-cuda12_6-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu - DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-xpu + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.12" + build_name: wheel-py3_12-cuda12_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-cpu-build: + wheel-py3_12-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -3560,11 +5618,24 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -3580,7 +5651,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3609,6 +5684,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -3618,6 +5694,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -3640,7 +5718,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_12-cpu + name: wheel-py3_12-cuda12_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3657,20 +5735,25 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cpu-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_12-cuda12_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_12-cpu-build + - wheel-py3_12-cuda12_8-build - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" steps: @@ -3689,7 +5772,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3718,6 +5805,20 @@ 
jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -3730,8 +5831,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_12-cpu + name: wheel-py3_12-cuda12_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -3743,6 +5845,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -3764,25 +5868,30 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cpu-upload: # Uploading + wheel-py3_12-cuda12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_12-cpu-test + needs: wheel-py3_12-cuda12_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-cpu + build_name: wheel-py3_12-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-cuda11_8-build: +<<<<<<< HEAD + wheel-py3_12-xpu-build: +======= + wheel-py3_12-cuda12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -3792,12 +5901,30 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 +<<<<<<< HEAD + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.12" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 + steps: +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 GPU_ARCH_TYPE: cuda 
SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -3813,7 +5940,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3842,15 +5973,8 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +<<<<<<< HEAD +======= - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -3873,7 +5997,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_12-cuda11_8 + name: wheel-py3_12-cuda12_9 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3890,10 +6014,11 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda11_8-test: # Testing + + wheel-py3_12-cuda12_9-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_12-cuda11_8-build + - wheel-py3_12-cuda12_9-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 @@ -3902,8 +6027,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -3923,7 +6048,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3952,6 +6077,18 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - 
name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -3961,22 +6098,13 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +<<<<<<< HEAD +======= - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_12-cuda11_8 + name: wheel-py3_12-cuda12_9 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Checkout PyTorch - uses: actions/checkout@v4 - with: - submodules: recursive - path: pytorch - show-progress: false - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - name: Populate binary env shell: bash run: | @@ -3998,26 +6126,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda11_8-upload: # Uploading + wheel-py3_12-cuda12_9-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_12-cuda11_8-test + needs: wheel-py3_12-cuda12_9-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-cuda11_8 + build_name: wheel-py3_12-cuda12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-cuda12_6-build: + wheel-py3_12-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -4027,12 +6155,21 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a 
workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - name: Display EC2 information shell: bash run: | @@ -4048,7 +6185,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4077,15 +6214,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -4108,7 +6237,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_12-cuda12_6 + name: wheel-py3_12-xpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -4125,21 +6254,24 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda12_6-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_12-xpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_12-cuda12_6-build + - wheel-py3_12-xpu-build - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" timeout-minutes: 300 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" steps: @@ -4158,7 +6290,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4187,6 +6323,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True 
-ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -4199,8 +6349,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_12-cuda12_6 + name: wheel-py3_12-xpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -4212,6 +6363,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -4233,26 +6386,25 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda12_6-upload: # Uploading + wheel-py3_12-xpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_12-cuda12_6-test + needs: wheel-py3_12-xpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-cuda12_6 + build_name: wheel-py3_12-xpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-cuda12_8-build: + wheel-py3_13-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -4262,12 +6414,23 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" + DESIRED_PYTHON: "3.13" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -4283,7 +6446,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4312,6 +6479,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -4321,6 +6489,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -4343,7 +6513,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_12-cuda12_8 + name: wheel-py3_13-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -4360,23 +6530,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda12_8-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_13-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_12-cuda12_8-build + - wheel-py3_13-cpu-build - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" timeout-minutes: 300 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" + DESIRED_PYTHON: "3.13" steps: - name: Display EC2 information shell: bash @@ -4393,7 +6566,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true 
with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4422,6 +6599,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -4434,8 +6625,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_12-cuda12_8 + name: wheel-py3_13-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -4447,6 +6639,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -4468,26 +6662,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda12_8-upload: # Uploading + wheel-py3_13-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_12-cuda12_8-test + needs: wheel-py3_13-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 - GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-cuda12_8 + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.13" + build_name: wheel-py3_13-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-xpu-build: +<<<<<<< HEAD + wheel-py3_13-cuda11_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -4497,11 +6691,11 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 + DESIRED_PYTHON: "3.13" steps: - name: Display EC2 information shell: bash @@ -4578,7 +6772,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() 
with: - name: wheel-py3_12-xpu + name: wheel-py3_13-cuda11_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -4595,22 +6789,23 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-xpu-test: # Testing + wheel-py3_13-cuda11_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_12-xpu-build + - wheel-py3_13-cuda11_8-build - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" + DESIRED_PYTHON: "3.13" steps: - name: Display EC2 information shell: bash @@ -4668,7 +6863,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_12-xpu + name: wheel-py3_13-cuda11_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -4702,25 +6897,28 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-xpu-upload: # Uploading + wheel-py3_13-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_12-xpu-test + needs: wheel-py3_13-cuda11_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu - DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-xpu + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.13" + build_name: wheel-py3_13-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13-cpu-build: +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_13-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -4730,11 +6928,24 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -4750,7 +6961,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4779,6 +6994,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -4788,6 +7004,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -4810,7 +7028,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13-cpu + name: wheel-py3_13-cuda12_6 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -4827,20 +7045,25 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cpu-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_13-cuda12_6-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13-cpu-build + - wheel-py3_13-cuda12_6-build - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" steps: @@ -4859,7 +7082,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4888,6 +7115,20 @@ 
jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -4900,8 +7141,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13-cpu + name: wheel-py3_13-cuda12_6 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -4913,6 +7155,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -4934,25 +7178,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cpu-upload: # Uploading + wheel-py3_13-cuda12_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13-cpu-test + needs: wheel-py3_13-cuda12_6-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13" - build_name: wheel-py3_13-cpu + build_name: wheel-py3_13-cuda12_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13-cuda11_8-build: + wheel-py3_13-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -4962,12 +7207,24 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -4983,7 +7240,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5012,6 +7273,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -5021,6 +7283,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -5043,7 +7307,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13-cuda11_8 + name: wheel-py3_13-cuda12_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -5060,10 +7324,14 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cuda11_8-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_13-cuda12_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13-cuda11_8-build + - wheel-py3_13-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 @@ -5072,8 +7340,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" @@ -5093,7 +7361,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5122,6 +7394,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be 
sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -5134,8 +7420,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13-cuda11_8 + name: wheel-py3_13-cuda12_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -5147,6 +7434,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -5168,26 +7457,30 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cuda11_8-upload: # Uploading + wheel-py3_13-cuda12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13-cuda11_8-test + needs: wheel-py3_13-cuda12_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13" - build_name: wheel-py3_13-cuda11_8 + build_name: wheel-py3_13-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13-cuda12_6-build: +<<<<<<< HEAD + wheel-py3_13-xpu-build: +======= + wheel-py3_13-cuda12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -5197,12 +7490,30 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 +<<<<<<< HEAD + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.13" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 + steps: +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" steps: + # NOTE: These 
environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -5218,7 +7529,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5247,15 +7562,8 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +<<<<<<< HEAD +======= - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -5278,7 +7586,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13-cuda12_6 + name: wheel-py3_13-cuda12_9 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -5295,10 +7603,11 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cuda12_6-test: # Testing + + wheel-py3_13-cuda12_9-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13-cuda12_6-build + - wheel-py3_13-cuda12_9-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 @@ -5307,8 +7616,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" @@ -5328,7 +7637,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5357,6 +7666,18 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + 
submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -5366,22 +7687,13 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +<<<<<<< HEAD +======= - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13-cuda12_6 + name: wheel-py3_13-cuda12_9 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Checkout PyTorch - uses: actions/checkout@v4 - with: - submodules: recursive - path: pytorch - show-progress: false - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - name: Populate binary env shell: bash run: | @@ -5403,26 +7715,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cuda12_6-upload: # Uploading + wheel-py3_13-cuda12_9-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13-cuda12_6-test + needs: wheel-py3_13-cuda12_9-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13" - build_name: wheel-py3_13-cuda12_6 + build_name: wheel-py3_13-cuda12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13-cuda12_8-build: + wheel-py3_13-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -5432,12 +7744,21 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp 
variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - name: Display EC2 information shell: bash run: | @@ -5453,7 +7774,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5482,15 +7803,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -5513,7 +7826,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13-cuda12_8 + name: wheel-py3_13-xpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -5530,21 +7843,24 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cuda12_8-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_13-xpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13-cuda12_8-build + - wheel-py3_13-xpu-build - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" timeout-minutes: 300 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" steps: @@ -5563,7 +7879,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5592,6 +7912,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout 
PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -5604,8 +7938,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13-cuda12_8 + name: wheel-py3_13-xpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -5617,6 +7952,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -5638,26 +7975,25 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cuda12_8-upload: # Uploading + wheel-py3_13-xpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13-cuda12_8-test + needs: wheel-py3_13-xpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu DESIRED_PYTHON: "3.13" - build_name: wheel-py3_13-cuda12_8 + build_name: wheel-py3_13-xpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13-xpu-build: + wheel-py3_13t-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -5667,12 +8003,23 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 + DESIRED_PYTHON: "3.13t" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -5688,7 +8035,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5717,6 +8068,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -5726,6 +8078,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -5748,7 +8102,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13-xpu + name: wheel-py3_13t-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -5765,10 +8119,14 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-xpu-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_13t-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13-xpu-build + - wheel-py3_13t-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" timeout-minutes: 300 @@ -5776,11 +8134,11 @@ jobs: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13" + DESIRED_PYTHON: "3.13t" steps: - name: Display EC2 information shell: bash @@ -5797,7 +8155,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5826,6 
+8188,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -5838,8 +8214,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13-xpu + name: wheel-py3_13t-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -5851,6 +8228,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -5872,25 +8251,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-xpu-upload: # Uploading + wheel-py3_13t-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13-xpu-test + needs: wheel-py3_13t-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu - DESIRED_PYTHON: "3.13" - build_name: wheel-py3_13-xpu + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.13t" + build_name: wheel-py3_13t-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13t-cpu-build: +<<<<<<< HEAD + wheel-py3_13t-cuda11_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -5900,8 +8280,9 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" steps: @@ -5980,7 +8361,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13t-cpu + name: wheel-py3_13t-cuda11_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -5997,20 +8378,21 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cpu-test: # Testing + wheel-py3_13t-cuda11_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13t-cpu-build + - wheel-py3_13t-cuda11_8-build - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy 
variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" steps: @@ -6070,7 +8452,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13t-cpu + name: wheel-py3_13t-cuda11_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -6104,25 +8486,28 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cpu-upload: # Uploading + wheel-py3_13t-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13t-cpu-test + needs: wheel-py3_13t-cuda11_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13t" - build_name: wheel-py3_13t-cpu + build_name: wheel-py3_13t-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13t-cuda11_8-build: +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_13t-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -6132,12 +8517,24 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -6153,7 +8550,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6182,6 +8583,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -6191,6 +8593,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -6213,7 +8617,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13t-cuda11_8 + name: wheel-py3_13t-cuda12_6 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -6230,10 +8634,14 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cuda11_8-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_13t-cuda12_6-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13t-cuda11_8-build + - wheel-py3_13t-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 @@ -6242,8 +8650,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" @@ -6263,7 +8671,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6292,6 +8704,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to 
be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -6304,8 +8730,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13t-cuda11_8 + name: wheel-py3_13t-cuda12_6 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -6317,6 +8744,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -6338,26 +8767,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cuda11_8-upload: # Uploading + wheel-py3_13t-cuda12_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13t-cuda11_8-test + needs: wheel-py3_13t-cuda12_6-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13t" - build_name: wheel-py3_13t-cuda11_8 + build_name: wheel-py3_13t-cuda12_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13t-cuda12_6-build: + wheel-py3_13t-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -6367,12 +8796,24 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -6388,7 +8829,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6417,6 +8862,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -6426,6 +8872,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -6448,7 +8896,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13t-cuda12_6 + name: wheel-py3_13t-cuda12_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -6465,10 +8913,14 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cuda12_6-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_13t-cuda12_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13t-cuda12_6-build + - wheel-py3_13t-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 @@ -6477,8 +8929,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" @@ -6498,7 +8950,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6527,6 +8983,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to 
be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -6539,8 +9009,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13t-cuda12_6 + name: wheel-py3_13t-cuda12_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -6552,6 +9023,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -6573,26 +9046,30 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cuda12_6-upload: # Uploading + wheel-py3_13t-cuda12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13t-cuda12_6-test + needs: wheel-py3_13t-cuda12_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13t" - build_name: wheel-py3_13t-cuda12_6 + build_name: wheel-py3_13t-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13t-cuda12_8-build: +<<<<<<< HEAD + wheel-py3_13t-xpu-build: +======= + wheel-py3_13t-cuda12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -6602,12 +9079,30 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 +<<<<<<< HEAD + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.13t" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 + steps: +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" steps: + # 
NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -6623,7 +9118,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6652,15 +9151,8 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +<<<<<<< HEAD +======= - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -6683,7 +9175,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13t-cuda12_8 + name: wheel-py3_13t-cuda12_9 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -6700,10 +9192,11 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cuda12_8-test: # Testing + + wheel-py3_13t-cuda12_9-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13t-cuda12_8-build + - wheel-py3_13t-cuda12_9-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 @@ -6712,8 +9205,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" @@ -6733,7 +9226,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6762,6 +9255,18 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: 
actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -6771,22 +9276,13 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +<<<<<<< HEAD +======= - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13t-cuda12_8 + name: wheel-py3_13t-cuda12_9 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Checkout PyTorch - uses: actions/checkout@v4 - with: - submodules: recursive - path: pytorch - show-progress: false - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - name: Populate binary env shell: bash run: | @@ -6808,22 +9304,22 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cuda12_8-upload: # Uploading + wheel-py3_13t-cuda12_9-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13t-cuda12_8-test + needs: wheel-py3_13t-cuda12_9-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13t" - build_name: wheel-py3_13t-cuda12_8 + build_name: wheel-py3_13t-cuda12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -6841,8 +9337,17 @@ jobs: GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 steps: + # NOTE: These environment 
variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - name: Display EC2 information shell: bash run: | @@ -6858,7 +9363,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6887,15 +9392,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -6935,6 +9432,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) wheel-py3_13t-xpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -6967,7 +9468,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6996,6 +9501,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
@@ -7010,6 +9529,7 @@ jobs: with: name: wheel-py3_13t-xpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -7021,6 +9541,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | diff --git a/.github/workflows/h100-distributed.yml b/.github/workflows/h100-distributed.yml new file mode 100644 index 000000000000..6283a0e8fbec --- /dev/null +++ b/.github/workflows/h100-distributed.yml @@ -0,0 +1,53 @@ +name: Limited CI for distributed tests on H100 + +on: + pull_request: + paths: + - .github/workflows/h100-distributed.yml + workflow_dispatch: + push: + tags: + - ciflow/h100-distributed/* + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + + get-label-type: + if: github.repository_owner == 'pytorch' + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + + linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-dist: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runner: "linux.12xlarge" + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '9.0' + test-matrix: | + { include: [ + { config: "h100_distributed", shard: 1, num_shards: 1, runner: "linux.aws.h100.8" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-sm90-test: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-dist + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-dist.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-dist.outputs.test-matrix }} + secrets: inherit diff --git a/.github/workflows/inductor-micro-benchmark-x86.yml b/.github/workflows/inductor-micro-benchmark-x86.yml index bcdfcedc2abf..e1cb27e151f2 100644 --- a/.github/workflows/inductor-micro-benchmark-x86.yml +++ b/.github/workflows/inductor-micro-benchmark-x86.yml @@ -22,7 +22,11 @@ jobs: uses: ./.github/workflows/_linux-build.yml with: build-environment: linux-jammy-py3.9-gcc11 +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Use metal host for benchmark jobs test-matrix: | { include: [ diff --git a/.github/workflows/inductor-micro-benchmark.yml b/.github/workflows/inductor-micro-benchmark.yml index dabb071bbc5e..46675f45b1c0 100644 --- 
a/.github/workflows/inductor-micro-benchmark.yml +++ b/.github/workflows/inductor-micro-benchmark.yml @@ -18,23 +18,39 @@ permissions: read-all jobs: get-default-label-prefix: name: get-default-label-prefix +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc9-inductor-micro-benchmark-build: name: cuda12.6-py3.10-gcc9-sm80 +======= + opt_out_experiments: lf + + build: + name: cuda12.8-py3.10-gcc9-sm80 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: - get-default-label-prefix with: runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm80 docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: '8.0' test-matrix: | { include: [ @@ -42,6 +58,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc9-inductor-micro-benchmark-test: name: cuda12.6-py3.10-gcc9-sm80 uses: ./.github/workflows/_linux-test.yml @@ -50,5 +67,15 @@ jobs: build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm80 docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-micro-benchmark-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-micro-benchmark-build.outputs.test-matrix }} +======= + test: + name: cuda12.8-py3.10-gcc9-sm80 + uses: ./.github/workflows/_linux-test.yml + needs: build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image: ${{ needs.build.outputs.docker-image }} + test-matrix: ${{ needs.build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 720 secrets: inherit diff --git a/.github/workflows/inductor-nightly.yml b/.github/workflows/inductor-nightly.yml index 31ed751bf440..8d910c42f47f 100644 --- a/.github/workflows/inductor-nightly.yml +++ b/.github/workflows/inductor-nightly.yml @@ -4,6 +4,12 @@ on: pull_request: paths: - .github/workflows/inductor-nightly.yml +<<<<<<< HEAD +======= + - benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_huggingface_inference.csv + - benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_timm_inference.csv + - benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed 
dtypes with float/bfloat16/half (#2791)) workflow_dispatch: schedule: # Run every day at 7:00 AM UTC @@ -18,13 +24,21 @@ permissions: read-all jobs: get-default-label-prefix: name: get-default-label-prefix +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD +======= + opt_out_experiments: lf +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build: name: linux-jammy-cpu-py3.9-gcc11-nightly-dynamo-benchmarks @@ -32,7 +46,11 @@ jobs: needs: get-default-label-prefix with: build-environment: linux-jammy-py3.9-gcc11-build +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" test-matrix: | { include: [ diff --git a/.github/workflows/inductor-perf-compare.yml b/.github/workflows/inductor-perf-compare.yml index 2a12f3440ee5..39199924c29e 100644 --- a/.github/workflows/inductor-perf-compare.yml +++ b/.github/workflows/inductor-perf-compare.yml @@ -16,22 +16,38 @@ jobs: get-default-label-prefix: if: github.repository_owner == 'pytorch' name: get-default-label-prefix +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc9-inductor-build: name: cuda12.6-py3.10-gcc9-sm80 +======= + opt_out_experiments: lf + + build: + name: cuda12.8-py3.10-gcc9-sm80 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: - get-default-label-prefix with: runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm80 docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: '8.0' test-matrix: | { include: 
[ @@ -42,6 +58,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc9-inductor-test: name: cuda12.6-py3.10-gcc9-sm80 uses: ./.github/workflows/_linux-test.yml @@ -52,4 +69,18 @@ jobs: test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.test-matrix }} # disable monitor in perf tests for more investigation disable-monitor: true +======= + test: + name: cuda12.8-py3.10-gcc9-sm80 + uses: ./.github/workflows/_linux-test.yml + needs: build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image: ${{ needs.build.outputs.docker-image }} + test-matrix: ${{ needs.build.outputs.test-matrix }} + # disable monitor in perf tests for more investigation + disable-monitor: false + monitor-log-interval: 15 + monitor-data-collect-interval: 4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/inductor-perf-test-nightly-aarch64.yml b/.github/workflows/inductor-perf-test-nightly-aarch64.yml index 2ee84e45ecc2..631d6c05081b 100644 --- a/.github/workflows/inductor-perf-test-nightly-aarch64.yml +++ b/.github/workflows/inductor-perf-test-nightly-aarch64.yml @@ -53,13 +53,21 @@ permissions: read-all jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD +======= + opt_out_experiments: lf +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) linux-jammy-aarch64-py3_10-inductor-build: name: linux-jammy-aarch64-py3.10-inductor @@ -69,7 +77,11 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner: linux.arm64.m7g.4xlarge build-environment: linux-jammy-aarch64-py3.10 +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks +======= + docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "inductor_huggingface_perf_cpu_aarch64", shard: 1, num_shards: 9, runner: "linux.arm64.m7g.metal" }, @@ -96,6 +108,7 @@ jobs: { config: "inductor_timm_perf_cpu_aarch64", shard: 13, num_shards: 15, runner: "linux.arm64.m7g.metal" }, { config: "inductor_timm_perf_cpu_aarch64", shard: 14, num_shards: 15, runner: "linux.arm64.m7g.metal" }, { config: "inductor_timm_perf_cpu_aarch64", shard: 15, num_shards: 15, runner: "linux.arm64.m7g.metal" }, +<<<<<<< HEAD { config: "inductor_torchbench_perf_cpu_aarch64", shard: 1, num_shards: 12, runner: "linux.arm64.m7g.metal" }, { config: "inductor_torchbench_perf_cpu_aarch64", shard: 2, num_shards: 12, runner: "linux.arm64.m7g.metal" }, { config: "inductor_torchbench_perf_cpu_aarch64", shard: 3, num_shards: 12, runner: 
"linux.arm64.m7g.metal" }, @@ -108,6 +121,23 @@ jobs: { config: "inductor_torchbench_perf_cpu_aarch64", shard: 10, num_shards: 12, runner: "linux.arm64.m7g.metal" }, { config: "inductor_torchbench_perf_cpu_aarch64", shard: 11, num_shards: 12, runner: "linux.arm64.m7g.metal" }, { config: "inductor_torchbench_perf_cpu_aarch64", shard: 12, num_shards: 12, runner: "linux.arm64.m7g.metal" }, +======= + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 1, num_shards: 15, runner: "linux.arm64.m7g.metal" }, + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 2, num_shards: 15, runner: "linux.arm64.m7g.metal" }, + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 3, num_shards: 15, runner: "linux.arm64.m7g.metal" }, + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 4, num_shards: 15, runner: "linux.arm64.m7g.metal" }, + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 5, num_shards: 15, runner: "linux.arm64.m7g.metal" }, + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 6, num_shards: 15, runner: "linux.arm64.m7g.metal" }, + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 7, num_shards: 15, runner: "linux.arm64.m7g.metal" }, + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 8, num_shards: 15, runner: "linux.arm64.m7g.metal" }, + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 9, num_shards: 15, runner: "linux.arm64.m7g.metal" }, + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 10, num_shards: 15, runner: "linux.arm64.m7g.metal" }, + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 11, num_shards: 15, runner: "linux.arm64.m7g.metal" }, + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 12, num_shards: 15, runner: "linux.arm64.m7g.metal" }, + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 13, num_shards: 15, runner: "linux.arm64.m7g.metal" }, + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 14, num_shards: 15, runner: "linux.arm64.m7g.metal" }, + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 15, num_shards: 15, runner: "linux.arm64.m7g.metal" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} selected-test-configs: ${{ inputs.benchmark_configs }} secrets: inherit @@ -125,7 +155,13 @@ jobs: test-matrix: ${{ needs.linux-jammy-aarch64-py3_10-inductor-build.outputs.test-matrix }} timeout-minutes: 720 # disable monitor in perf tests for more investigation +<<<<<<< HEAD disable-monitor: true +======= + disable-monitor: false + monitor-log-interval: 15 + monitor-data-collect-interval: 4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/inductor-perf-test-nightly-h100.yml b/.github/workflows/inductor-perf-test-nightly-h100.yml index 682df7b212b4..13581a2c5e4b 100644 --- a/.github/workflows/inductor-perf-test-nightly-h100.yml +++ b/.github/workflows/inductor-perf-test-nightly-h100.yml @@ -2,7 +2,11 @@ name: inductor-perf-nightly-h100 on: schedule: +<<<<<<< HEAD - cron: 0 7 * * 1-6 +======= + - cron: 15 0,4,8,12,16,20 * * 1-6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - cron: 0 7 * * 0 # NB: GitHub has an upper limit of 10 inputs here, so before we can sort it # out, let try to run torchao cudagraphs_low_precision as part of cudagraphs @@ -68,23 +72,40 @@ permissions: read-all jobs: 
get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD # NB: Keep this in sync with trunk.yml build: name: cuda12.6-py3.10-gcc9-sm90 +======= + opt_out_experiments: lf + + # NB: Keep this in sync with trunk.yml + build: + name: cuda12.8-py3.10-gcc9-sm90 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm90 docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: '9.0' test-matrix: | { include: [ @@ -93,6 +114,7 @@ jobs: { config: "inductor_huggingface_perf_cuda_h100", shard: 3, num_shards: 5, runner: "linux.aws.h100" }, { config: "inductor_huggingface_perf_cuda_h100", shard: 4, num_shards: 5, runner: "linux.aws.h100" }, { config: "inductor_huggingface_perf_cuda_h100", shard: 5, num_shards: 5, runner: "linux.aws.h100" }, +<<<<<<< HEAD { config: "inductor_timm_perf_cuda_h100", shard: 1, num_shards: 6, runner: "linux.aws.h100" }, { config: "inductor_timm_perf_cuda_h100", shard: 2, num_shards: 6, runner: "linux.aws.h100" }, { config: "inductor_timm_perf_cuda_h100", shard: 3, num_shards: 6, runner: "linux.aws.h100" }, @@ -105,10 +127,29 @@ jobs: { config: "inductor_torchbench_perf_cuda_h100", shard: 4, num_shards: 6, runner: "linux.aws.h100" }, { config: "inductor_torchbench_perf_cuda_h100", shard: 5, num_shards: 6, runner: "linux.aws.h100" }, { config: "inductor_torchbench_perf_cuda_h100", shard: 6, num_shards: 6, runner: "linux.aws.h100" }, +======= + { config: "inductor_timm_perf_cuda_h100", shard: 1, num_shards: 7, runner: "linux.aws.h100" }, + { config: "inductor_timm_perf_cuda_h100", shard: 2, num_shards: 7, runner: "linux.aws.h100" }, + { config: "inductor_timm_perf_cuda_h100", shard: 3, num_shards: 7, runner: "linux.aws.h100" }, + { config: "inductor_timm_perf_cuda_h100", shard: 4, num_shards: 7, runner: "linux.aws.h100" }, + { config: "inductor_timm_perf_cuda_h100", shard: 5, num_shards: 7, runner: "linux.aws.h100" }, + { config: "inductor_timm_perf_cuda_h100", shard: 6, num_shards: 7, runner: "linux.aws.h100" }, + { config: "inductor_timm_perf_cuda_h100", shard: 7, num_shards: 7, runner: "linux.aws.h100" }, + { config: "inductor_torchbench_perf_cuda_h100", shard: 1, num_shards: 9, runner: "linux.aws.h100" }, + { config: "inductor_torchbench_perf_cuda_h100", shard: 2, num_shards: 9, runner: "linux.aws.h100" }, + { config: 
"inductor_torchbench_perf_cuda_h100", shard: 3, num_shards: 9, runner: "linux.aws.h100" }, + { config: "inductor_torchbench_perf_cuda_h100", shard: 4, num_shards: 9, runner: "linux.aws.h100" }, + { config: "inductor_torchbench_perf_cuda_h100", shard: 5, num_shards: 9, runner: "linux.aws.h100" }, + { config: "inductor_torchbench_perf_cuda_h100", shard: 6, num_shards: 9, runner: "linux.aws.h100" }, + { config: "inductor_torchbench_perf_cuda_h100", shard: 7, num_shards: 9, runner: "linux.aws.h100" }, + { config: "inductor_torchbench_perf_cuda_h100", shard: 8, num_shards: 9, runner: "linux.aws.h100" }, + { config: "inductor_torchbench_perf_cuda_h100", shard: 9, num_shards: 9, runner: "linux.aws.h100" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} selected-test-configs: ${{ inputs.benchmark_configs }} secrets: inherit +<<<<<<< HEAD test-nightly: name: cuda12.6-py3.10-gcc9-sm90 uses: ./.github/workflows/_linux-test.yml @@ -116,40 +157,85 @@ jobs: if: github.event.schedule == '0 7 * * 1-6' with: build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm90 +======= + test-periodically: + name: cuda12.8-py3.10-gcc9-sm90 + uses: ./.github/workflows/_linux-test.yml + needs: build + if: github.event.schedule == '15 0,4,8,12,16,20 * * 1-6' + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true docker-image: ${{ needs.build.outputs.docker-image }} test-matrix: ${{ needs.build.outputs.test-matrix }} timeout-minutes: 720 +<<<<<<< HEAD # disable monitor in perf tests for more investigation disable-monitor: true secrets: inherit test-weekly: name: cuda12.6-py3.10-gcc9-sm90 +======= + # disable monitor in perf tests, next step is to enable it + disable-monitor: false + monitor-log-interval: 15 + monitor-data-collect-interval: 4 + secrets: inherit + + test-weekly: + name: cuda12.8-py3.10-gcc9-sm90 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-test.yml needs: build if: github.event.schedule == '0 7 * * 0' with: +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm90 +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true docker-image: ${{ needs.build.outputs.docker-image }} test-matrix: ${{ needs.build.outputs.test-matrix }} timeout-minutes: 1440 +<<<<<<< HEAD # disable monitor in perf tests for more investigation disable-monitor: true secrets: inherit test: name: cuda12.6-py3.10-gcc9-sm90 +======= + # disable monitor in perf tests, next step is to enable it + disable-monitor: false + monitor-log-interval: 15 + monitor-data-collect-interval: 4 + secrets: inherit + + test: + name: cuda12.8-py3.10-gcc9-sm90 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: 
./.github/workflows/_linux-test.yml needs: build if: github.event_name == 'workflow_dispatch' with: +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm90 +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }} docker-image: ${{ needs.build.outputs.docker-image }} test-matrix: ${{ needs.build.outputs.test-matrix }} timeout-minutes: 720 # disable monitor in perf tests for more investigation +<<<<<<< HEAD disable-monitor: true +======= + disable-monitor: false + monitor-log-interval: 15 + monitor-data-collect-interval: 4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/inductor-perf-test-nightly-macos.yml b/.github/workflows/inductor-perf-test-nightly-macos.yml index a63731f759b9..5107eed3b227 100644 --- a/.github/workflows/inductor-perf-test-nightly-macos.yml +++ b/.github/workflows/inductor-perf-test-nightly-macos.yml @@ -1,5 +1,9 @@ +<<<<<<< HEAD name: perf-nightly-macos # Technically not an inductor test, but uses it as a template for tracking macos performance +======= +name: inductor-perf-nightly-macos +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) on: schedule: @@ -21,9 +25,19 @@ on: required: false type: string default: torchbench_perf_mps +<<<<<<< HEAD concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} +======= + pull_request: + paths: + - .github/workflows/inductor-perf-test-nightly-macos.yml + - .ci/pytorch/macos-test.sh + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cancel-in-progress: true permissions: read-all @@ -35,6 +49,7 @@ jobs: uses: ./.github/workflows/_mac-build.yml with: sync-tag: macos-perf-py3-arm64-build +<<<<<<< HEAD build-environment: macos-py3-arm64 runner-type: macos-m1-stable build-generates-artifacts: true @@ -43,6 +58,18 @@ jobs: test-matrix: | { include: [ { config: "perf_smoketest", shard: 1, num_shards: 1, runner: "macos-m1-14" }, +======= + build-environment: macos-py3-arm64-distributed + runner-type: macos-m1-stable + build-generates-artifacts: true + # To match the one pre-installed in the m1 runners + python-version: 3.12.7 + test-matrix: | + { include: [ + { config: "perf_smoketest", shard: 1, num_shards: 3, runner: "macos-m2-15" }, + { config: "perf_smoketest", shard: 2, num_shards: 3, runner: "macos-m2-15" }, + { config: "perf_smoketest", shard: 3, num_shards: 3, runner: "macos-m2-15" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) ]} secrets: inherit @@ -51,10 +78,21 @@ jobs: uses: ./.github/workflows/_mac-test.yml needs: macos-perf-py3-arm64-build with: +<<<<<<< HEAD build-environment: macos-py3-arm64 # Same as the build job python-version: 3.9.12 test-matrix: ${{ needs.macos-perf-py3-arm64-build.outputs.test-matrix }} # disable monitor in perf tests for more investigation disable-monitor: true +======= + build-environment: macos-py3-arm64-distributed + # Same as the build job + python-version: 3.12.7 + test-matrix: ${{ needs.macos-perf-py3-arm64-build.outputs.test-matrix }} + disable-monitor: false + monitor-log-interval: 15 + monitor-data-collect-interval: 4 + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/inductor-perf-test-nightly-rocm.yml b/.github/workflows/inductor-perf-test-nightly-rocm.yml index 30489f34254a..3212c0f66c31 100644 --- a/.github/workflows/inductor-perf-test-nightly-rocm.yml +++ b/.github/workflows/inductor-perf-test-nightly-rocm.yml @@ -70,13 +70,18 @@ permissions: read-all jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-inductor-benchmark-build: if: github.repository_owner == 'pytorch' @@ -85,6 +90,17 @@ jobs: with: build-environment: linux-focal-rocm6_3-py3_10 docker-image-name: pytorch-linux-focal-rocm-n-py3 +======= + opt_out_experiments: lf + + linux-jammy-rocm-py3_10-inductor-benchmark-build: + if: github.repository_owner == 'pytorch' + name: rocm-py3_10-inductor-benchmark-build + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-jammy-rocm-py3_10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi300.2" }, @@ -102,6 +118,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-inductor-benchmark-test: permissions: id-token: write @@ -117,4 +134,23 @@ jobs: timeout-minutes: 720 # Disable monitor in perf tests for more investigation disable-monitor: true +======= + linux-jammy-rocm-py3_10-inductor-benchmark-test: + permissions: + id-token: write + contents: read + name: rocm-py3_10-inductor-benchmark-test + uses: ./.github/workflows/_rocm-test.yml + needs: linux-jammy-rocm-py3_10-inductor-benchmark-build + with: + build-environment: linux-jammy-rocm-py3_10 + dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true + docker-image: ${{ needs.linux-jammy-rocm-py3_10-inductor-benchmark-build.outputs.docker-image }} + test-matrix: 
${{ needs.linux-jammy-rocm-py3_10-inductor-benchmark-build.outputs.test-matrix }} + timeout-minutes: 720 + # Disable monitor in perf tests for more investigation + disable-monitor: true + monitor-log-interval: 10 + monitor-data-collect-interval: 2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/inductor-perf-test-nightly-x86-zen.yml b/.github/workflows/inductor-perf-test-nightly-x86-zen.yml new file mode 100644 index 000000000000..68418859a9b1 --- /dev/null +++ b/.github/workflows/inductor-perf-test-nightly-x86-zen.yml @@ -0,0 +1,129 @@ +name: inductor-perf-nightly-x86-zen + +on: + push: + tags: + - ciflow/inductor-perf-test-nightly-x86-zen/* + schedule: + # - cron: 0 7 * * 1-6 + # - cron: 0 7 * * 0 + # Does not perform max_autotune on CPU, so skip the weekly run setup + - cron: 0 7 * * * + # NB: GitHub has an upper limit of 10 inputs here + workflow_dispatch: + inputs: + training: + # CPU for training is not typical, but leave the option open here + description: Run training (off by default)? + required: false + type: boolean + default: false + inference: + description: Run inference (on by default)? + required: false + type: boolean + default: true + default: + description: Run inductor_default? + required: false + type: boolean + default: true + dynamic: + description: Run inductor_dynamic_shapes? + required: false + type: boolean + default: false + cppwrapper: + description: Run inductor_cpp_wrapper? + required: false + type: boolean + default: false + aotinductor: + description: Run aot_inductor for inference? + required: false + type: boolean + default: false + benchmark_configs: + description: The list of configs used the benchmark + required: false + type: string + default: inductor_huggingface_perf_zen_cpu_x86,inductor_timm_perf_zen_cpu_x86,inductor_torchbench_perf_zen_cpu_x86 + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +permissions: read-all + +jobs: + get-label-type: + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 + if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + opt_out_experiments: lf + + linux-jammy-zen-cpu-py3_9-gcc11-inductor-build: + name: linux-jammy-zen-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-py3.9-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + test-matrix: | + { include: [ + { config: "inductor_huggingface_perf_zen_cpu_x86", shard: 1, num_shards: 3, runner: "linux.24xlarge.amd" }, + { config: "inductor_huggingface_perf_zen_cpu_x86", shard: 2, num_shards: 3, runner: "linux.24xlarge.amd" }, + { config: "inductor_huggingface_perf_zen_cpu_x86", shard: 3, num_shards: 3, runner: "linux.24xlarge.amd" }, + { config: 
"inductor_timm_perf_zen_cpu_x86", shard: 1, num_shards: 5, runner: "linux.24xlarge.amd" }, + { config: "inductor_timm_perf_zen_cpu_x86", shard: 2, num_shards: 5, runner: "linux.24xlarge.amd" }, + { config: "inductor_timm_perf_zen_cpu_x86", shard: 3, num_shards: 5, runner: "linux.24xlarge.amd" }, + { config: "inductor_timm_perf_zen_cpu_x86", shard: 4, num_shards: 5, runner: "linux.24xlarge.amd" }, + { config: "inductor_timm_perf_zen_cpu_x86", shard: 5, num_shards: 5, runner: "linux.24xlarge.amd" }, + { config: "inductor_torchbench_perf_zen_cpu_x86", shard: 1, num_shards: 4, runner: "linux.24xlarge.amd" }, + { config: "inductor_torchbench_perf_zen_cpu_x86", shard: 2, num_shards: 4, runner: "linux.24xlarge.amd" }, + { config: "inductor_torchbench_perf_zen_cpu_x86", shard: 3, num_shards: 4, runner: "linux.24xlarge.amd" }, + { config: "inductor_torchbench_perf_zen_cpu_x86", shard: 4, num_shards: 4, runner: "linux.24xlarge.amd" }, + ]} + selected-test-configs: ${{ inputs.benchmark_configs }} + secrets: inherit + + linux-jammy-zen-cpu-py3_9-gcc11-inductor-test-nightly: + name: linux-jammy-zen-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-zen-cpu-py3_9-gcc11-inductor-build + if: github.event.schedule == '0 7 * * *' + with: + build-environment: linux-jammy-py3.9-gcc11-build + dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true + docker-image: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + timeout-minutes: 720 + # disable monitor in perf tests + disable-monitor: false + monitor-log-interval: 15 + monitor-data-collect-interval: 4 + secrets: inherit + + + linux-jammy-zen-cpu-py3_9-gcc11-inductor-test: + name: linux-jammy-zen-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-zen-cpu-py3_9-gcc11-inductor-build + if: github.event_name == 'workflow_dispatch' + with: + build-environment: linux-jammy-py3.9-gcc11-build + dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }} + docker-image: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + timeout-minutes: 720 + # disable monitor in perf tests + disable-monitor: false + monitor-log-interval: 15 + monitor-data-collect-interval: 4 + secrets: inherit diff --git a/.github/workflows/inductor-perf-test-nightly-x86.yml b/.github/workflows/inductor-perf-test-nightly-x86.yml index 7db8089fd5f6..e0eb134fd117 100644 --- a/.github/workflows/inductor-perf-test-nightly-x86.yml +++ b/.github/workflows/inductor-perf-test-nightly-x86.yml @@ -55,13 +55,21 @@ permissions: read-all jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ 
github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD +======= + opt_out_experiments: lf +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) linux-jammy-cpu-py3_9-gcc11-inductor-build: name: linux-jammy-cpu-py3.9-gcc11-inductor @@ -70,7 +78,11 @@ jobs: with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.9-gcc11-build +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "inductor_huggingface_perf_cpu_x86", shard: 1, num_shards: 3, runner: "linux.24xl.spr-metal" }, @@ -101,8 +113,15 @@ jobs: docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} timeout-minutes: 720 +<<<<<<< HEAD # disable monitor in perf tests for more investigation disable-monitor: true +======= + # disable monitor in perf tests + disable-monitor: false + monitor-log-interval: 15 + monitor-data-collect-interval: 4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit @@ -117,6 +136,13 @@ jobs: docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} timeout-minutes: 720 +<<<<<<< HEAD # disable monitor in perf tests for more investigation disable-monitor: true +======= + # disable monitor in perf tests + disable-monitor: false + monitor-log-interval: 15 + monitor-data-collect-interval: 4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/inductor-perf-test-nightly.yml b/.github/workflows/inductor-perf-test-nightly.yml index 5541bfe22ac6..f905d381e83e 100644 --- a/.github/workflows/inductor-perf-test-nightly.yml +++ b/.github/workflows/inductor-perf-test-nightly.yml @@ -68,23 +68,40 @@ permissions: read-all jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD # NB: Keep this in sync with trunk.yml linux-focal-cuda12_6-py3_10-gcc9-inductor-build: name: cuda12.6-py3.10-gcc9-sm80 +======= + opt_out_experiments: lf + + # NB: Keep this in sync with trunk.yml + build: + name: cuda12.8-py3.10-gcc9-sm80 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for 
mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm80 docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: '8.0' test-matrix: | { include: [ @@ -111,6 +128,7 @@ jobs: selected-test-configs: ${{ inputs.benchmark_configs }} secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc9-inductor-test-nightly: name: cuda12.6-py3.10-gcc9-sm80 uses: ./.github/workflows/_linux-test.yml @@ -154,4 +172,53 @@ jobs: timeout-minutes: 720 # disable monitor in perf tests for more investigation disable-monitor: true +======= + test-nightly: + name: cuda12.8-py3.10-gcc9-sm80 + uses: ./.github/workflows/_linux-test.yml + needs: build + if: github.event.schedule == '0 7 * * 1-6' + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true + docker-image: ${{ needs.build.outputs.docker-image }} + test-matrix: ${{ needs.build.outputs.test-matrix }} + timeout-minutes: 720 + disable-monitor: false + monitor-log-interval: 15 + monitor-data-collect-interval: 4 + secrets: inherit + + test-weekly: + name: cuda12.8-py3.10-gcc9-sm80 + uses: ./.github/workflows/_linux-test.yml + needs: build + if: github.event.schedule == '0 7 * * 0' + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true + docker-image: ${{ needs.build.outputs.docker-image }} + test-matrix: ${{ needs.build.outputs.test-matrix }} + timeout-minutes: 1440 + # disable monitor in perf tests, next step is to enable it + disable-monitor: false + monitor-log-interval: 15 + monitor-data-collect-interval: 4 + secrets: inherit + + test: + name: cuda12.8-py3.10-gcc9-sm80 + uses: ./.github/workflows/_linux-test.yml + needs: build + if: github.event_name == 'workflow_dispatch' + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }} + docker-image: ${{ needs.build.outputs.docker-image }} + test-matrix: ${{ needs.build.outputs.test-matrix }} + timeout-minutes: 720 + disable-monitor: false + monitor-log-interval: 15 + monitor-data-collect-interval: 4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/inductor-periodic.yml b/.github/workflows/inductor-periodic.yml index ada7139a81a2..708d30f9f18a 100644 --- 
a/.github/workflows/inductor-periodic.yml +++ b/.github/workflows/inductor-periodic.yml @@ -20,22 +20,38 @@ permissions: read-all jobs: get-default-label-prefix: name: get-default-label-prefix +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc9-periodic-dynamo-benchmarks-build: name: cuda12.6-py3.10-gcc9-sm86-periodic-dynamo-benchmarks +======= + opt_out_experiments: lf + + linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build: + name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-default-label-prefix with: runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm86 docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: '8.6' test-matrix: | { include: [ @@ -57,6 +73,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc9-periodic-dynamo-benchmarks-test: name: cuda12.6-py3.10-gcc9-sm86-periodic-dynamo-benchmarks uses: ./.github/workflows/_linux-test.yml @@ -74,6 +91,25 @@ jobs: with: build-environment: linux-focal-rocm6_3-py3_10 docker-image-name: pytorch-linux-focal-rocm-n-py3 +======= + linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-test: + name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build: + if: github.repository_owner == 'pytorch' + name: rocm-py3_10-periodic-dynamo-benchmarks + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-jammy-rocm-py3_10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sync-tag: rocm-build test-matrix: | { include: [ @@ -95,6 +131,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-periodic-dynamo-benchmarks-test: permissions: id-token: write @@ -110,13 +147,35 @@ jobs: 
linux-focal-cuda12_6-py3_10-gcc9-inductor-build-gcp: name: cuda12.6-py3.10-gcc9-sm80 +======= + linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-test: + permissions: + id-token: write + contents: read + name: rocm-py3_10-periodic-dynamo-benchmarks + uses: ./.github/workflows/_rocm-test.yml + needs: linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build + with: + build-environment: linux-jammy-rocm-py3_10 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build: + name: cuda12.8-py3.10-gcc9-sm80 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: - get-default-label-prefix with: runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm80 docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: '8.0' test-matrix: | { include: [ @@ -124,6 +183,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc9-inductor-test-gcp: name: cuda12.6-py3.10-gcc9-sm80 uses: ./.github/workflows/_linux-test.yml @@ -134,6 +194,16 @@ jobs: test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build-gcp.outputs.test-matrix }} # disable monitor in smoke perf tests for more investigation disable-monitor: true +======= + linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-test: + name: cuda12.8-py3.10-gcc9-sm80 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build: @@ -142,7 +212,11 @@ jobs: needs: get-default-label-prefix with: build-environment: linux-jammy-py3.9-gcc11-build +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" test-matrix: | { include: [ @@ -171,6 +245,7 @@ jobs: secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc9-inductor-build: name: cuda12.6-py3.10-gcc9-sm86 uses: ./.github/workflows/_linux-build.yml @@ -181,6 +256,18 @@ jobs: cuda-arch-list: '8.6' runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" sync-tag: linux-focal-cuda12_6-py3_10-gcc9-inductor-build +======= + linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: + name: 
cuda12.8-py3.10-gcc9-sm86 + uses: ./.github/workflows/_linux-build.yml + needs: get-default-label-prefix + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks + cuda-arch-list: '8.6' + runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" + sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, @@ -196,6 +283,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc9-inductor-test: name: cuda12.6-py3.10-gcc9-sm86 uses: ./.github/workflows/_linux-test.yml @@ -204,6 +292,16 @@ jobs: build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm86 docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.test-matrix }} +======= + linux-jammy-cuda12_8-py3_10-gcc9-inductor-test: + name: cuda12.8-py3.10-gcc9-sm86 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit linux-jammy-cpu-py3_9-gcc11-inductor-build: @@ -212,7 +310,11 @@ jobs: needs: get-default-label-prefix with: build-environment: linux-jammy-py3.9-gcc11-build +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build test-matrix: | diff --git a/.github/workflows/inductor-rocm-mi300.yml b/.github/workflows/inductor-rocm-mi300.yml index da19dde06b78..3fbaf2da74c9 100644 --- a/.github/workflows/inductor-rocm-mi300.yml +++ b/.github/workflows/inductor-rocm-mi300.yml @@ -28,22 +28,38 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-inductor-build: name: rocm6.3-py3.10-inductor +======= + opt_out_experiments: lf + + linux-jammy-rocm-py3_10-inductor-build: + name: rocm-py3.10-inductor-mi300 +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-rocm6.3-py3.10 docker-image-name: pytorch-linux-focal-rocm-n-py3 +======= + build-environment: linux-jammy-rocm-py3.10-mi300 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, @@ -51,6 +67,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-inductor-test: permissions: id-token: write @@ -62,4 +79,17 @@ jobs: build-environment: linux-focal-rocm6.3-py3.10 docker-image: ${{ needs.linux-focal-rocm6_3-py3_10-inductor-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-rocm6_3-py3_10-inductor-build.outputs.test-matrix }} +======= + linux-jammy-rocm-py3_10-inductor-test: + permissions: + id-token: write + contents: read + name: rocm-py3.10-inductor-mi300 + uses: ./.github/workflows/_rocm-test.yml + needs: linux-jammy-rocm-py3_10-inductor-build + with: + build-environment: linux-jammy-rocm-py3.10-mi300 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-inductor-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/inductor-rocm.yml b/.github/workflows/inductor-rocm.yml index b224f3c68827..eaa7a2794691 100644 --- a/.github/workflows/inductor-rocm.yml +++ b/.github/workflows/inductor-rocm.yml @@ -21,22 +21,38 @@ permissions: jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-inductor-build: name: rocm6.3-py3.10-inductor +======= + opt_out_experiments: lf + + linux-jammy-rocm-py3_10-inductor-build: + name: rocm-py3.10-inductor +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-rocm6.3-py3.10 docker-image-name: pytorch-linux-focal-rocm-n-py3 +======= + build-environment: linux-jammy-rocm-py3.10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2" 
}, @@ -44,6 +60,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-inductor-test: permissions: id-token: write @@ -55,4 +72,17 @@ jobs: build-environment: linux-focal-rocm6.3-py3.10 docker-image: ${{ needs.linux-focal-rocm6_3-py3_10-inductor-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-rocm6_3-py3_10-inductor-build.outputs.test-matrix }} +======= + linux-jammy-rocm-py3_10-inductor-test: + permissions: + id-token: write + contents: read + name: rocm-py3.10-inductor + uses: ./.github/workflows/_rocm-test.yml + needs: linux-jammy-rocm-py3_10-inductor-build + with: + build-environment: linux-jammy-rocm-py3.10 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-inductor-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/inductor-unittest.yml b/.github/workflows/inductor-unittest.yml index ffc32540931b..cc414fb8af0f 100644 --- a/.github/workflows/inductor-unittest.yml +++ b/.github/workflows/inductor-unittest.yml @@ -1,6 +1,10 @@ # Workflow: Inductor Unit Test # 1. runs unit tests for inductor. +<<<<<<< HEAD # 2. perfoms daily memory leak checks and reruns of disabled tests, scheduled at `29 8 * * *`. +======= +# 2. performs daily memory leak checks and reruns of disabled tests, scheduled at `29 8 * * *`. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) name: inductor-unittest on: @@ -17,13 +21,18 @@ permissions: read-all jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc9-inductor-build: name: cuda12.6-py3.10-gcc9-sm86 @@ -32,10 +41,22 @@ jobs: with: build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm86 docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks +======= + opt_out_experiments: lf + + linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: + name: cuda12.8-py3.10-gcc9-sm86 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: '8.6' runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" test-matrix: | { include: [ +<<<<<<< HEAD { config: "inductor", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "inductor", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "inductor_distributed", shard: 1, num_shards: 1, runner: 
"linux.g5.12xlarge.nvidia.gpu" }, @@ -61,10 +82,38 @@ jobs: with: build-environment: linux-focal-cuda12.6-py3.12-gcc9-sm86 docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks +======= + { config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor_distributed", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu" }, + { config: "inductor_cpp_wrapper", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor_cpp_wrapper", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc9-inductor-test: + name: cuda12.8-py3.10-gcc9-sm86 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cuda12_8-py3_12-gcc9-inductor-build: + name: cuda12.8-py3.12-gcc9-sm86 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm86 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: '8.6' runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" test-matrix: | { include: [ +<<<<<<< HEAD { config: "inductor", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "inductor", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, ]} @@ -78,6 +127,21 @@ jobs: build-environment: linux-focal-cuda12.6-py3.12-gcc9-sm86 docker-image: ${{ needs.linux-focal-cuda12_6-py3_12-gcc9-inductor-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-cuda12_6-py3_12-gcc9-inductor-build.outputs.test-matrix }} +======= + { config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_12-gcc9-inductor-test: + name: cuda12.8-py3.12-gcc9-sm86 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_12-gcc9-inductor-build + with: + build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm86 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_12-gcc9-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_12-gcc9-inductor-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit linux-jammy-cpu-py3_12-inductor-halide-build: @@ -86,11 +150,19 @@ jobs: needs: get-label-type with: build-environment: linux-jammy-py3.12-gcc11 +<<<<<<< HEAD docker-image-name: 
pytorch-linux-jammy-py3.12-halide runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" test-matrix: | { include: [ { config: "inductor-halide", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3.12-halide + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + test-matrix: | + { include: [ + { config: "inductor-halide", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} secrets: inherit @@ -110,11 +182,19 @@ jobs: needs: get-label-type with: build-environment: linux-jammy-py3.12-gcc11 +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3.12-triton-cpu runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" test-matrix: | { include: [ { config: "inductor-triton-cpu", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3.12-triton-cpu + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + test-matrix: | + { include: [ + { config: "inductor-triton-cpu", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} secrets: inherit @@ -134,6 +214,7 @@ jobs: needs: get-label-type with: build-environment: linux-jammy-py3.9-gcc11-build +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" test-matrix: | @@ -142,6 +223,16 @@ jobs: { config: "inductor_amx", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" }, { config: "inductor_avx2", shard: 1, num_shards: 2, runner: "linux.10xlarge.avx2" }, { config: "inductor_avx2", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" }, +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + test-matrix: | + { include: [ + { config: "inductor_amx", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, + { config: "inductor_amx", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, + { config: "inductor_avx2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.10xlarge.avx2" }, + { config: "inductor_avx2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.10xlarge.avx2" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} secrets: inherit @@ -155,6 +246,7 @@ jobs: test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_6-py3_13-gcc9-inductor-build: name: cuda12.6-py3.13-gcc9-sm86 uses: ./.github/workflows/_linux-build.yml @@ -178,4 +270,29 @@ jobs: build-environment: linux-focal-cuda12.6-py3.13-gcc9-sm86 docker-image: ${{ needs.linux-focal-cuda12_6-py3_13-gcc9-inductor-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-cuda12_6-py3_13-gcc9-inductor-build.outputs.test-matrix }} +======= + linux-jammy-cuda12_8-py3_13-gcc9-inductor-build: + name: cuda12.8-py3.13-gcc9-sm86 + uses: ./.github/workflows/_linux-build.yml + 
needs: get-label-type + with: + build-environment: linux-jammy-cuda12.8-py3.13-gcc9-sm86 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks + cuda-arch-list: '8.6' + test-matrix: | + { include: [ + { config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_13-gcc9-inductor-test: + name: cuda12.8-py3.13-gcc9-sm86 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_13-gcc9-inductor-build + with: + build-environment: linux-jammy-cuda12.8-py3.13-gcc9-sm86 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_13-gcc9-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_13-gcc9-inductor-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml index 0cccdd96a67f..5e3c466b5333 100644 --- a/.github/workflows/inductor.yml +++ b/.github/workflows/inductor.yml @@ -33,13 +33,18 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc9-inductor-build: name: cuda12.6-py3.10-gcc9-sm86 @@ -69,6 +74,38 @@ jobs: build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm86 docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.test-matrix }} +======= + opt_out_experiments: lf + + linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: + name: cuda12.8-py3.10-gcc9-sm86 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks + cuda-arch-list: '8.6' + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build + test-matrix: | + { include: [ + { config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor_timm", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + { config: 
"inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc9-inductor-test: + name: cuda12.8-py3.10-gcc9-sm86 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit linux-jammy-cpu-py3_9-gcc11-inductor-build: @@ -77,11 +114,16 @@ jobs: needs: get-label-type with: build-environment: linux-jammy-py3.9-gcc11-build +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build test-matrix: | { include: [ +<<<<<<< HEAD { config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" }, { config: "cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" }, { config: "dynamic_cpu_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.8xlarge.amx" }, @@ -90,6 +132,16 @@ jobs: { config: "dynamic_cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" }, { config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" }, { config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.24xl.spr-metal" }, +======= + { config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, + { config: "cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, + { config: "dynamic_cpu_inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, + { config: "dynamic_cpu_inductor_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, + { config: "dynamic_cpu_inductor_timm", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, + { config: "dynamic_cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, + { config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, + { config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.24xl.spr-metal" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} secrets: inherit diff --git a/.github/workflows/lint-autoformat.yml b/.github/workflows/lint-autoformat.yml index bf68a0877b90..89b9ce15f10c 100644 --- a/.github/workflows/lint-autoformat.yml +++ b/.github/workflows/lint-autoformat.yml @@ -1,10 +1,15 
@@ name: Apply lint suggestions on: +<<<<<<< HEAD push: tags: - ciflow/autoformat/* +======= + pull_request: + types: [opened, synchronize, reopened, labeled, unlabeled] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: lintrunner-autoformat: @@ -12,6 +17,7 @@ jobs: contents: read pull-requests: write runs-on: lf.linux.2xlarge +<<<<<<< HEAD if: ${{ github.repository_owner == 'pytorch' && github.event.pull_request.user.login != 'ezyang' && github.event.pull_request.user.login != 'malfet' && !startsWith(github.head_ref, 'export-') }} steps: - name: Checkout pytorch @@ -28,6 +34,22 @@ jobs: # we can't run all files here because only changes around where the diff are shown in the PR UI run: | export ADDITIONAL_LINTRUNNER_ARGS="format" +======= + if: ${{ github.repository_owner == 'pytorch' && contains(github.event.pull_request.labels.*.name, 'autoformat') }} + steps: + - name: Checkout pytorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 + with: + submodules: true + fetch-depth: 0 + - name: Run lintrunner (nonretryable) + continue-on-error: true + run: | + set -ex + python3 -m venv /tmp/venv + source /tmp/venv/bin/activate + export ADDITIONAL_LINTRUNNER_ARGS="format --all-files" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bash .github/scripts/lintrunner.sh - name: Check for changes id: git-check @@ -37,7 +59,11 @@ jobs: - name: Suggest changes if: steps.git-check.outputs.changes == 'true' continue-on-error: true +<<<<<<< HEAD uses: parkerbxyz/suggest-changes@v1 +======= + uses: parkerbxyz/suggest-changes@a2ec1653b0c4cc8287d682f0066dba4a173cc7f3 # v1.0.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: comment: "Please commit the suggested changes from pytorch's linter." 
diff --git a/.github/workflows/lint-bc.yml b/.github/workflows/lint-bc.yml index 64ed12e9c5b8..b0d55d6a3a11 100644 --- a/.github/workflows/lint-bc.yml +++ b/.github/workflows/lint-bc.yml @@ -20,7 +20,11 @@ jobs: runs-on: ubuntu-latest steps: - name: Run BC Lint Action +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/bc-lint@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/bc-lint@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: repo: ${{ github.event.pull_request.head.repo.full_name }} base_sha: ${{ github.event.pull_request.base.sha }} diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 82d2532fa995..efc18675fce7 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -10,6 +10,11 @@ on: - main - release/* - landchecks/* +<<<<<<< HEAD +======= + tags: + - ciflow/pull/* +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) workflow_dispatch: permissions: read-all @@ -19,19 +24,31 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} lintrunner-clang: +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.7 +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) needs: get-label-type with: timeout: 120 runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" +<<<<<<< HEAD docker-image: pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter +======= + docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout # to run git rev-parse HEAD~:.ci/docker when a new image is needed fetch-depth: 0 @@ -43,12 +60,20 @@ jobs: .github/scripts/lintrunner.sh lintrunner-noclang: +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.7 +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) needs: get-label-type with: timeout: 120 runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" +<<<<<<< HEAD docker-image: pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter +======= + docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout # to run git rev-parse HEAD~:.ci/docker when a new image is needed fetch-depth: 0 
@@ -59,11 +84,16 @@ jobs: .github/scripts/lintrunner.sh quick-checks: +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.7 +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) needs: get-label-type with: timeout: 120 runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" +<<<<<<< HEAD docker-image: pytorch-linux-focal-linter fetch-depth: 0 ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} @@ -72,6 +102,12 @@ jobs: CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" +======= + docker-image: ci-image:pytorch-linux-jammy-linter + fetch-depth: 0 + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Ensure no non-breaking spaces # NB: We use 'printf' below rather than '\u000a' since bash pre-4.2 # does not support the '\u000a' syntax (which is relevant for local linters) @@ -103,7 +139,11 @@ jobs: if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'skip-pr-sanity-checks') steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false fetch-depth: -1 @@ -116,22 +156,35 @@ jobs: bash .github/scripts/pr-sanity-check.sh workflow-checks: +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.7 +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) needs: get-label-type with: timeout: 120 runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" +<<<<<<< HEAD docker-image: pytorch-linux-focal-linter +======= + docker-image: ci-image:pytorch-linux-jammy-linter +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fetch-depth: -1 submodules: true ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} script: | +<<<<<<< HEAD # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" # Regenerate workflows export RELEASE_VERSION_TAG=2.7 +======= + # Regenerate workflows + export RELEASE_VERSION_TAG=2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) .github/scripts/generate_ci_workflows.py RC=0 @@ -155,11 +208,16 @@ jobs: exit $RC toc: +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.7 +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) needs: get-label-type with: timeout: 120 runner: "${{ needs.get-label-type.outputs.label-type 
}}linux.2xlarge" +<<<<<<< HEAD docker-image: pytorch-linux-focal-linter fetch-depth: 0 ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} @@ -168,6 +226,12 @@ jobs: CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" +======= + docker-image: ci-image:pytorch-linux-jammy-linter + fetch-depth: 0 + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Regenerate ToCs and check that they didn't change set -eu @@ -195,11 +259,16 @@ jobs: test-tools: name: Test tools if: ${{ github.repository == 'pytorch/pytorch' }} +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.7 +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) needs: get-label-type with: timeout: 120 runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" +<<<<<<< HEAD docker-image: pytorch-linux-focal-linter fetch-depth: 0 ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} @@ -208,6 +277,12 @@ jobs: CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" +======= + docker-image: ci-image:pytorch-linux-jammy-linter + fetch-depth: 0 + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Test tools PYTHONPATH=$(pwd) pytest tools/stats PYTHONPATH=$(pwd) pytest tools/test -o "python_files=test*.py" @@ -219,12 +294,20 @@ jobs: runs-on: linux.24_04.4x steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false fetch-depth: 1 - name: Setup Python 3.9 +<<<<<<< HEAD uses: actions/setup-python@v4 +======= + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: '3.9' architecture: x64 @@ -242,15 +325,32 @@ jobs: test_collect_env: if: ${{ github.repository == 'pytorch/pytorch' }} name: Test collect_env +<<<<<<< HEAD runs-on: linux.24_04.4x strategy: matrix: test_type: [with_torch, without_torch, older_python_version] +======= + runs-on: ${{ matrix.runner }} + strategy: + matrix: + include: + - test_type: with_torch + runner: linux.24_04.4x + - test_type: without_torch + runner: linux.24_04.4x + - test_type: older_python_version + runner: linux.24_04.4x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # [see note: pytorch repo ref] # deep clone (fetch-depth 0) required, to allow us to use git log - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 
5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false fetch-depth: 1 @@ -263,7 +363,11 @@ jobs: echo "MIN_PYTHON_VERSION=$(python3 .github/scripts/get_ci_variable.py --min-python-version)" >> "${GITHUB_OUTPUT}" - name: Setup Old Python version if: matrix.test_type == 'older_python_version' +<<<<<<< HEAD uses: actions/setup-python@v4 +======= + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: 3.8 architecture: x64 @@ -273,7 +377,11 @@ jobs: **/requirements.txt - name: Setup Min Python version if: matrix.test_type != 'older_python_version' +<<<<<<< HEAD uses: actions/setup-python@v4 +======= + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: ${{ steps.get-min-python-version.outputs.MIN_PYTHON_VERSION }} architecture: x64 @@ -292,6 +400,18 @@ jobs: # All we need to see is that it passes python3 torch/utils/collect_env.py +<<<<<<< HEAD +======= + link-check: + name: Link checks + needs: get-label-type + uses: ./.github/workflows/_link_check.yml + with: + runner: ${{ needs.get-label-type.outputs.label-type }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + secrets: inherit + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true diff --git a/.github/workflows/linux-aarch64.yml b/.github/workflows/linux-aarch64.yml index 31dcc855de4b..c422607a0d08 100644 --- a/.github/workflows/linux-aarch64.yml +++ b/.github/workflows/linux-aarch64.yml @@ -19,7 +19,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -33,6 +37,7 @@ jobs: with: runner_prefix: ${{ needs.get-label-type.outputs.label-type }} build-environment: linux-jammy-aarch64-py3.10 +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11 runner: linux.arm64.2xlarge test-matrix: | @@ -44,6 +49,18 @@ jobs: { config: "default", shard: 1, num_shards: 3, runner: "linux.arm64.m7g.4xlarge" }, { config: "default", shard: 2, num_shards: 3, runner: "linux.arm64.m7g.4xlarge" }, { config: "default", shard: 3, num_shards: 3, runner: "linux.arm64.m7g.4xlarge" }, +======= + docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11 + runner: linux.arm64.m7g.4xlarge + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.m7g.4xlarge" }, + { config: "default", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type 
}}linux.arm64.m7g.4xlarge" }, + { config: "default", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.m7g.4xlarge" }, + { config: "default", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.m8g.4xlarge" }, + { config: "default", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.m8g.4xlarge" }, + { config: "default", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.m8g.4xlarge" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} secrets: inherit diff --git a/.github/workflows/llm_td_retrieval.yml b/.github/workflows/llm_td_retrieval.yml index 3b7baeb04f44..6fe0e391b885 100644 --- a/.github/workflows/llm_td_retrieval.yml +++ b/.github/workflows/llm_td_retrieval.yml @@ -12,7 +12,11 @@ jobs: name: get-label-type # Don't run on forked repos if: github.repository_owner == 'pytorch' +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -27,7 +31,11 @@ jobs: needs: get-label-type steps: - name: Clone PyTorch +<<<<<<< HEAD uses: actions/checkout@v3 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: repository: pytorch/pytorch fetch-depth: 0 @@ -37,31 +45,48 @@ jobs: uses: ./pytorch/.github/actions/setup-linux - name: Clone CodeLlama +<<<<<<< HEAD uses: actions/checkout@v3 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: repository: osalpekar/codellama ref: main path: codellama - name: Clone Target Determination Code +<<<<<<< HEAD uses: actions/checkout@v3 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: repository: osalpekar/llm-target-determinator ref: v0.0.2 path: llm-target-determinator +<<<<<<< HEAD - name: Setup miniconda uses: pytorch/test-infra/.github/actions/setup-miniconda@release/2.7 with: python-version: "3.9" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Install requirements shell: bash run: | set -euxo pipefail +<<<<<<< HEAD ${CONDA_RUN} pip install -r llm-target-determinator/requirements.txt cd "${GITHUB_WORKSPACE}/codellama" ${CONDA_RUN} pip install -e . +======= + python3 -m pip install -r llm-target-determinator/requirements.txt + cd "${GITHUB_WORKSPACE}/codellama" + python3 -m pip install -e . 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Fetch CodeLlama Checkpoint shell: bash @@ -72,7 +97,11 @@ jobs: aws s3 cp "s3://target-determinator-assets/CodeLlama-7b-Python" "CodeLlama-7b-Python" --recursive --no-progress - name: Fetch indexes +<<<<<<< HEAD uses: nick-fields/retry@v3.0.0 +======= + uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: max_attempts: 3 retry_wait_seconds: 10 @@ -80,7 +109,11 @@ jobs: shell: bash command: | set -euxo pipefail +<<<<<<< HEAD ${CONDA_RUN} python -m pip install awscli==1.29.40 +======= + python3 -m pip install awscli==1.29.40 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cd "${GITHUB_WORKSPACE}"/llm-target-determinator/assets aws s3 cp "s3://target-determinator-assets/indexes/latest" . --recursive @@ -94,7 +127,12 @@ jobs: run: | set -euxo pipefail cd "${GITHUB_WORKSPACE}"/llm-target-determinator +<<<<<<< HEAD ${CONDA_RUN} torchrun \ +======= + export PATH="$HOME/.local/bin:$PATH" + torchrun \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --standalone \ --nnodes=1 \ --nproc-per-node=1 \ @@ -105,7 +143,11 @@ jobs: zip -r mappings.zip mappings - name: Upload results to s3 +<<<<<<< HEAD uses: seemethere/upload-artifact-s3@v5 +======= + uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ steps.run_retriever.outcome == 'success' }} with: name: llm_results @@ -120,5 +162,9 @@ jobs: AWS_REGION: "" - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() diff --git a/.github/workflows/mac-mps.yml b/.github/workflows/mac-mps.yml index c655b66d31c1..077506af36a8 100644 --- a/.github/workflows/mac-mps.yml +++ b/.github/workflows/mac-mps.yml @@ -23,7 +23,11 @@ jobs: runner-type: macos-m1-stable build-generates-artifacts: true # To match the one pre-installed in the m1 runners +<<<<<<< HEAD python-version: 3.9.12 +======= + python-version: 3.12.7 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # The runner macos-m2-14 is not a typo, it's a custom runner that is different # than our AWS macos-m1-14 runners test-matrix: | @@ -36,12 +40,22 @@ jobs: macos-py3-arm64-mps-test: name: macos-py3-arm64-mps +<<<<<<< HEAD uses: ./.github/workflows/_mac-test-mps.yml +======= + uses: ./.github/workflows/_mac-test.yml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) needs: macos-py3-arm64-build with: sync-tag: macos-py3-arm64-mps-test build-environment: macos-py3-arm64 # Same as the build job +<<<<<<< HEAD python-version: 3.9.12 test-matrix: ${{ needs.macos-py3-arm64-build.outputs.test-matrix }} +======= + python-version: 3.12.7 + test-matrix: ${{ needs.macos-py3-arm64-build.outputs.test-matrix 
}} + disable-monitor: false +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/nightly-s3-uploads.yml b/.github/workflows/nightly-s3-uploads.yml index fc52df29b521..419a21b8de09 100644 --- a/.github/workflows/nightly-s3-uploads.yml +++ b/.github/workflows/nightly-s3-uploads.yml @@ -23,12 +23,20 @@ jobs: environment: upload-stats steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 1 submodules: false +<<<<<<< HEAD - uses: actions/setup-python@v4 +======= + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: '3.11' cache: pip @@ -37,13 +45,21 @@ jobs: pip3 install requests==2.32.2 boto3==1.35.42 - name: Authenticate with AWS +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v4 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::308535385114:role/gha_upload_external_contrib_stats aws-region: us-east-1 - name: Upload external contribution stats +<<<<<<< HEAD uses: nick-fields/retry@v3.0.0 +======= + uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 3014ac777a5c..82d8542ce9ce 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -5,8 +5,12 @@ on: - cron: 0 0 * * * push: tags: +<<<<<<< HEAD # Final Release tags look like: v1.11.0 - v[0-9]+.[0-9]+.[0-9]+ +======= + # NOTE: Doc build pipelines should only get triggered on release candidate builds +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Release candidate tags look like: v1.11.0-rc1 - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - ciflow/nightly/* @@ -20,7 +24,11 @@ concurrency: jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -28,6 +36,18 @@ jobs: curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD +======= + link-check: + name: Link checks + needs: get-label-type + uses: ./.github/workflows/_link_check.yml + with: + runner: ${{ needs.get-label-type.outputs.label-type }} + ref: ${{ github.sha }} + secrets: inherit + +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) docs-build: name: docs build uses: ./.github/workflows/_linux-build.yml @@ -35,7 +55,11 @@ jobs: with: runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" build-environment: linux-jammy-py3.9-gcc11 +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3.9-gcc11 +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit docs-push: @@ -48,7 +72,11 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.9-gcc11 docker-image: ${{ needs.docs-build.outputs.docker-image }} +<<<<<<< HEAD push: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || (startsWith(github.event.ref, 'refs/tags/v') && !contains(github.event.ref, 'rc')) }} +======= + push: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || startsWith(github.event.ref, 'refs/tags/v') }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) run-doxygen: true secrets: GH_PYTORCHBOT_TOKEN: ${{ secrets.GH_PYTORCHBOT_TOKEN }} @@ -79,7 +107,11 @@ jobs: if: github.repository_owner == 'pytorch' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') steps: - name: "${{ matrix.repo-owner }}/${{ matrix.repo-name }} update-commit-hash" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/update-commit-hash@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/update-commit-hash@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: repo-owner: ${{ matrix.repo-owner }} repo-name: ${{ matrix.repo-name }} diff --git a/.github/workflows/nitpicker.yml b/.github/workflows/nitpicker.yml index 4c769a2b9e02..5cf28563573f 100644 --- a/.github/workflows/nitpicker.yml +++ b/.github/workflows/nitpicker.yml @@ -19,7 +19,11 @@ jobs: if: ${{ github.event.pull_request.number != 26921 && github.repository_owner == 'pytorch' }} steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - uses: ethanis/nitpicker@v1 with: nitpicks: '.github/nitpicks.yml' diff --git a/.github/workflows/operator_benchmark.yml b/.github/workflows/operator_benchmark.yml new file mode 100644 index 000000000000..810602b9c57b --- /dev/null +++ b/.github/workflows/operator_benchmark.yml @@ -0,0 +1,59 @@ +name: operator_benchmark + +on: + push: + tags: + - ciflow/op-benchmark/* + workflow_dispatch: + inputs: + test_mode: + required: false + type: string + default: 'short' + description: tag filter for operator benchmarks, options from long, short, all + schedule: + # Run at 07:00 UTC every Sunday + - cron: 0 7 * * 0 + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +permissions: read-all + +jobs: + linux-jammy-cpu-py3_9-gcc11-opbenchmark-build: + if: github.repository_owner == 'pytorch' + name: linux-jammy-cpu-py3.9-gcc11-opbenchmark + uses: 
./.github/workflows/_linux-build.yml + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + test-matrix: | + { include: [ + { config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, + ]} + secrets: inherit + + linux-jammy-cpu-py3_9-gcc11-opbenchmark-on-demand-build: + if: ${{ github.event_name == 'workflow_dispatch' && github.repository_owner == 'pytorch' }} + name: linux-jammy-cpu-py3.9-gcc11-opbenchmark + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + test-matrix: | + { include: [ + { config: "cpu_operator_benchmark_${{ inputs.test_mode }}", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, + ]} + secrets: inherit + + linux-jammy-cpu-py3_9-gcc11-opbenchmark-test: + name: linux-jammy-cpu-py3.9-gcc11-opbenchmark + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cpu-py3_9-gcc11-opbenchmark-build + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-opbenchmark-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-opbenchmark-build.outputs.test-matrix }} + secrets: inherit diff --git a/.github/workflows/periodic-rocm-mi300.yml b/.github/workflows/periodic-rocm-mi300.yml new file mode 100644 index 000000000000..e0983577cec5 --- /dev/null +++ b/.github/workflows/periodic-rocm-mi300.yml @@ -0,0 +1,81 @@ +name: periodic-rocm-mi300 + +on: + schedule: + # We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs. + # Also run less frequently on weekends. 
+ - cron: 45 0,8,16 * * 1-5 + - cron: 45 4 * * 0,6 + - cron: 45 4,12,20 * * 1-5 + - cron: 45 12 * * 0,6 + - cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests + push: + tags: + - ciflow/periodic-rocm-mi300/* + branches: + - release/* + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }} + cancel-in-progress: true + +permissions: read-all + +jobs: + llm-td: + if: github.repository_owner == 'pytorch' + name: before-test + uses: ./.github/workflows/llm_td_retrieval.yml + permissions: + id-token: write + contents: read + + target-determination: + name: before-test + uses: ./.github/workflows/target_determination.yml + needs: llm-td + permissions: + id-token: write + contents: read + + get-label-type: + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 + if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + + linux-jammy-rocm-py3_10-build: + name: linux-jammy-rocm-py3.10-mi300 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-rocm-py3.10-mi300 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 + test-matrix: | + { include: [ + { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] }, + { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] }, + { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] }, + ]} + secrets: inherit + + linux-jammy-rocm-py3_10-test: + permissions: + id-token: write + contents: read + name: linux-jammy-rocm-py3.10-mi300 + uses: ./.github/workflows/_rocm-test.yml + needs: + - linux-jammy-rocm-py3_10-build + - target-determination + with: + build-environment: linux-jammy-rocm-py3.10-mi300 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} + secrets: inherit diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index 76953638d64c..9f525e1096f9 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -41,7 +41,11 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' with: triggering_actor: ${{ github.triggering_actor }} @@ -49,14 +53,24 @@ jobs: curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type 
}} +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc11-build: name: linux-focal-cuda12.6-py3.10-gcc11 +======= + linux-jammy-cuda12_8-py3_10-gcc11-build: + name: linux-jammy-cuda12.8-py3.10-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3.10-gcc11 docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc11 +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, @@ -68,6 +82,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc11-test: name: linux-focal-cuda12.6-py3.10-gcc11 uses: ./.github/workflows/_linux-test.yml @@ -82,12 +97,33 @@ jobs: linux-focal-cuda11_8-py3_9-gcc9-build: name: linux-focal-cuda11.8-py3.9-gcc9 +======= + linux-jammy-cuda12_8-py3_10-gcc11-test: + name: linux-jammy-cuda12.8-py3.10-gcc11 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-build + - target-determination + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc11 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cuda12_8-py3_9-gcc9-build: + name: linux-jammy-cuda12.8-py3.9-gcc9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda11.8-py3.9-gcc9 docker-image-name: pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9 +======= + build-environment: linux-jammy-cuda12.8-py3.9-gcc9 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: 8.6 test-matrix: | { include: [ @@ -97,6 +133,7 @@ jobs: build-with-debug: false secrets: inherit +<<<<<<< HEAD linux-focal-cuda11_8-py3_9-gcc9-test: name: linux-focal-cuda11.8-py3.9-gcc9 uses: ./.github/workflows/_linux-test.yml @@ -109,12 +146,31 @@ jobs: linux-focal-cuda11_8-py3_10-gcc9-debug-build: name: linux-focal-cuda11.8-py3.10-gcc9-debug +======= + linux-jammy-cuda12_8-py3_9-gcc9-test: + name: linux-jammy-cuda12.8-py3.9-gcc9 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_9-gcc9-build + with: + build-environment: linux-jammy-cuda12.8-py3.9-gcc9 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_9-gcc9-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_9-gcc9-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc9-debug-build: + name: linux-jammy-cuda12.8-py3.10-gcc9-debug +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: 
./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda11.8-py3.10-gcc9-debug docker-image-name: pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9 +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-debug + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build-with-debug: true test-matrix: | { include: [ @@ -128,6 +184,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-cuda11_8-py3_10-gcc9-debug-test: name: linux-focal-cuda11.8-py3.10-gcc9-debug uses: ./.github/workflows/_linux-test.yml @@ -142,12 +199,33 @@ jobs: linux-focal-rocm6_3-py3_10-build: name: linux-focal-rocm6.3-py3.10 +======= + linux-jammy-cuda12_8-py3_10-gcc9-debug-test: + name: linux-jammy-cuda12.8-py3.10-gcc9-debug + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc9-debug-build + - target-determination + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-debug + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-debug-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-debug-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-rocm-py3_10-build: + name: linux-jammy-rocm-py3.10 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-rocm6.3-py3.10 docker-image-name: pytorch-linux-focal-rocm-n-py3 +======= + build-environment: linux-jammy-rocm-py3.10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] }, @@ -156,6 +234,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-test: permissions: id-token: write @@ -173,10 +252,30 @@ jobs: linux-focal-cuda12_6-py3-gcc11-slow-gradcheck-build: name: linux-focal-cuda12.6-py3-gcc11-slow-gradcheck +======= + linux-jammy-rocm-py3_10-test: + permissions: + id-token: write + contents: read + name: linux-jammy-rocm-py3.10 + uses: ./.github/workflows/_rocm-test.yml + needs: + - linux-jammy-rocm-py3_10-build + - target-determination + with: + build-environment: linux-jammy-rocm-py3.10 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build: + name: linux-jammy-cuda12.8-py3-gcc11-slow-gradcheck +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3-gcc11-slow-gradcheck docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc11 cuda-arch-list: 8.6 @@ -220,3 +319,33 @@ jobs: { config: "default", shard: 1, num_shards: 1, runner: 
"${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, ]} secrets: inherit +======= + build-environment: linux-jammy-cuda12.8-py3-gcc11-slow-gradcheck + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: 8.6 + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] }, + { config: "default", shard: 2, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] }, + { config: "default", shard: 3, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] }, + { config: "default", shard: 4, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] }, + { config: "default", shard: 5, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] }, + { config: "default", shard: 6, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] }, + { config: "default", shard: 7, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] }, + { config: "default", shard: 8, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-test: + name: linux-jammy-cuda12.8-py3-gcc11-slow-gradcheck + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build + - target-determination + with: + build-environment: linux-jammy-cuda12.8-py3-gcc11-slow-gradcheck + docker-image: ${{ needs.linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build.outputs.test-matrix }} + timeout-minutes: 300 + secrets: inherit +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 2881210f3f3d..169e6fca4d5e 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -9,6 +9,11 @@ on: - main - release/* - landchecks/* +<<<<<<< HEAD +======= + tags: + - ciflow/pull/* +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) workflow_dispatch: schedule: - cron: 29 8 * * * # about 1:29am PDT @@ -38,7 +43,11 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -52,7 +61,11 @@ jobs: with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.9-gcc11 +<<<<<<< HEAD 
docker-image-name: pytorch-linux-jammy-py3.9-gcc11 +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, @@ -79,7 +92,10 @@ jobs: build-environment: linux-jammy-py3.9-gcc11 docker-image: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.test-matrix }} +<<<<<<< HEAD timeout-minutes: 300 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit linux-docs: @@ -98,7 +114,11 @@ jobs: with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.9-gcc11-no-ops +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3.9-gcc11 +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, @@ -112,21 +132,35 @@ jobs: with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.9-gcc11-pch +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3.9-gcc11 +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, ]} secrets: inherit +<<<<<<< HEAD linux-jammy-py3_10-clang15-asan-build: name: linux-jammy-py3.10-clang15-asan +======= + linux-jammy-py3_10-clang18-asan-build: + name: linux-jammy-py3.10-clang18-asan +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-py3.10-clang15-asan docker-image-name: pytorch-linux-jammy-py3-clang15-asan +======= + build-environment: linux-jammy-py3.10-clang18-asan + docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, @@ -140,6 +174,7 @@ jobs: secrets: inherit +<<<<<<< HEAD linux-jammy-py3_10-clang15-asan-test: name: linux-jammy-py3.10-clang15-asan uses: ./.github/workflows/_linux-test.yml @@ -155,12 +190,34 @@ jobs: linux-focal-py3_9-clang10-onnx-build: name: linux-focal-py3.9-clang10-onnx +======= + linux-jammy-py3_10-clang18-asan-test: + name: linux-jammy-py3.10-clang18-asan + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-py3_10-clang18-asan-build + - target-determination + with: + build-environment: linux-jammy-py3.10-clang18-asan + docker-image: ${{ needs.linux-jammy-py3_10-clang18-asan-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_10-clang18-asan-build.outputs.test-matrix }} + sync-tag: asan-test + secrets: inherit + + 
linux-jammy-py3_9-clang12-onnx-build: + name: linux-jammy-py3.9-clang12-onnx +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-py3.9-clang10-onnx docker-image-name: pytorch-linux-focal-py3-clang10-onnx +======= + build-environment: linux-jammy-py3.9-clang12-onnx + docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-onnx +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, @@ -168,6 +225,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-py3_9-clang10-onnx-test: name: linux-focal-py3.9-clang10-onnx uses: ./.github/workflows/_linux-test.yml @@ -217,12 +275,33 @@ jobs: linux-focal-py3_13-clang10-build: name: linux-focal-py3.13-clang10 +======= + linux-jammy-py3_9-clang12-onnx-test: + name: linux-jammy-py3.9-clang12-onnx + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-py3_9-clang12-onnx-build + - target-determination + with: + build-environment: linux-jammy-py3.9-clang12-onnx + docker-image: ${{ needs.linux-jammy-py3_9-clang12-onnx-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_9-clang12-onnx-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-py3_9-clang12-build: + name: linux-jammy-py3.9-clang12 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-py3.13-clang10 docker-image-name: pytorch-linux-focal-py3.13-clang10 +======= + build-environment: linux-jammy-py3.9-clang12 + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-clang12 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, @@ -235,6 +314,7 @@ jobs: { config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, { config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, { config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, +<<<<<<< HEAD ]} secrets: inherit @@ -251,12 +331,72 @@ jobs: linux-focal-cuda11_8-py3_10-gcc9-build: name: linux-focal-cuda11.8-py3.10-gcc9 +======= + { config: "einops", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" } + ]} + secrets: inherit + + linux-jammy-py3_9-clang12-test: + name: linux-jammy-py3.9-clang12 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-py3_9-clang12-build + - target-determination + with: + build-environment: linux-jammy-py3.9-clang12 + docker-image: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.test-matrix }} + secrets: inherit + + 
linux-jammy-py3_13-clang12-build: + name: linux-jammy-py3.13-clang12 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-py3.13-clang12 + docker-image-name: ci-image:pytorch-linux-jammy-py3.13-clang12 + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "crossref", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, + { config: "crossref", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, + { config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, + { config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, + { config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, + { config: "einops", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" } + ]} + secrets: inherit + + linux-jammy-py3_13-clang12-test: + name: linux-jammy-py3.13-clang12 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-py3_13-clang12-build + with: + build-environment: linux-jammy-py3.13-clang12 + docker-image: ${{ needs.linux-jammy-py3_13-clang12-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_13-clang12-build.outputs.test-matrix }} + timeout-minutes: 600 + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-build-distributed: + name: linux-jammy-cuda12.8-py3.10-gcc11-build-distributed +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda11.8-py3.10-gcc9 docker-image-name: pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9 +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: '7.5' test-matrix: | { include: [ @@ -266,6 +406,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-cuda11_8-py3_10-gcc9-test: name: linux-focal-cuda11.8-py3.10-gcc9 uses: ./.github/workflows/_linux-test.yml @@ -281,12 +422,34 @@ jobs: linux-focal-cuda12_6-py3_10-gcc11-build: name: linux-focal-cuda12.6-py3.10-gcc11 +======= + linux-jammy-cuda12_8-py3_10-gcc11-test-distributed: + name: linux-jammy-cuda12.8-py3.10-gcc11-test + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-build-distributed + - target-determination + with: + timeout-minutes: 360 
+ build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-build: + name: linux-jammy-cuda12.8-py3.10-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3.10-gcc11 docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc11 +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, @@ -297,6 +460,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc11-test: name: linux-focal-cuda12.6-py3.10-gcc11 uses: ./.github/workflows/_linux-test.yml @@ -308,6 +472,19 @@ jobs: build-environment: linux-focal-cuda12.6-py3.10-gcc11 docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc11-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc11-build.outputs.test-matrix }} +======= + linux-jammy-cuda12_8-py3_10-gcc11-test: + name: linux-jammy-cuda12.8-py3.10-gcc11 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-build + - target-determination + with: + timeout-minutes: 360 + build-environment: linux-jammy-cuda12.8-py3.10-gcc11 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit linux-jammy-py3-clang12-mobile-build: @@ -317,7 +494,11 @@ jobs: with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3-clang12-mobile-build +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3-clang15-asan +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3-clang15-asan +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build-generates-artifacts: false test-matrix: | { include: [ @@ -325,27 +506,46 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-jammy-cuda-11_8-cudnn9-py3_9-clang12-build: name: linux-jammy-cuda11.8-cudnn9-py3.9-clang12 +======= + linux-jammy-cuda12_8-cudnn9-py3_9-clang12-build: + name: linux-jammy-cuda12.8-cudnn9-py3.9-clang12 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-cuda11.8-cudnn9-py3.9-clang12 docker-image-name: pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-clang12 +======= + build-environment: linux-jammy-cuda12.8-cudnn9-py3.9-clang12 + docker-image-name: 
ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, ]} secrets: inherit +<<<<<<< HEAD linux-focal-py3_9-clang9-xla-build: name: linux-focal-py3_9-clang9-xla +======= + linux-jammy-py3_9-clang9-xla-build: + name: linux-jammy-py3_9-clang9-xla +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-py3.9-clang9-xla +======= + build-environment: linux-jammy-py3.9-clang9-xla +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite test-matrix: | { include: [ @@ -353,6 +553,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-py3_9-clang9-xla-test: name: linux-focal-py3_9-clang9-xla uses: ./.github/workflows/_linux-test.yml @@ -384,12 +585,31 @@ jobs: linux-focal-cpu-py3_10-gcc11-bazel-test: name: linux-focal-cpu-py3.10-gcc11-bazel-test +======= + linux-jammy-py3_9-clang9-xla-test: + name: linux-jammy-py3_9-clang9-xla + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-py3_9-clang9-xla-build + with: + build-environment: linux-jammy-py3.9-clang9-xla + docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cpu-py3_10-gcc11-bazel-test: + name: linux-jammy-cpu-py3.10-gcc11-bazel-test +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_bazel-build-test.yml needs: get-label-type with: runner: "${{ needs.get-label-type.outputs.label-type }}linux.large" +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3.10-gcc11-bazel-test docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc11 +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-bazel-test + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-version: cpu test-matrix: | { include: [ @@ -404,7 +624,11 @@ jobs: with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.9-gcc11-mobile-lightweight-dispatch-build +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3.9-gcc11 +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build-generates-artifacts: false test-matrix: | { include: [ @@ -412,16 +636,28 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-build: # don't run build twice on main if: github.event_name == 'pull_request' name: linux-focal-rocm6.3-py3.10 +======= + linux-jammy-rocm-py3_10-build: + # don't run build twice on main + if: github.event_name == 'pull_request' + name: linux-jammy-rocm-py3.10 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed 
dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-rocm6.3-py3.10 docker-image-name: pytorch-linux-focal-rocm-n-py3 +======= + build-environment: linux-jammy-rocm-py3.10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sync-tag: rocm-build test-matrix: | { include: [ @@ -431,14 +667,24 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc11-sm89-build: name: linux-focal-cuda12.6-py3.10-gcc11-sm89 +======= + linux-jammy-cuda12_8-py3_10-gcc11-sm89-build: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm89 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3.10-gcc11-sm89 docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc11 +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm89 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: 8.9 test-matrix: | { include: [ @@ -450,6 +696,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD unstable-linux-focal-cuda12_6-py3_10-gcc11-sm89-build-xfail: # A version of the build that sets a larger number of jobs for a build. May # OOM @@ -480,6 +727,18 @@ jobs: build-environment: linux-focal-cuda12.6-py3.10-gcc11-sm89 docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc11-sm89-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc11-sm89-build.outputs.test-matrix }} +======= + linux-jammy-cuda12_8-py3_10-gcc11-sm89-test: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm89 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-sm89-build + - target-determination + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm89 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm89-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm89-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit linux-jammy-py3-clang12-executorch-build: @@ -489,7 +748,11 @@ jobs: with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3-clang12-executorch +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3-clang12-executorch +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, @@ -500,20 +763,34 @@ jobs: name: linux-jammy-py3-clang12-executorch uses: ./.github/workflows/_linux-test.yml needs: linux-jammy-py3-clang12-executorch-build +<<<<<<< HEAD +======= + if: false # Has been broken for a while +>>>>>>> 5729657180 ([ROCm] Specialized binary 
elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: build-environment: linux-jammy-py3-clang12-executorch docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }} secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_4-py3_10-gcc9-inductor-build: name: cuda12.4-py3.10-gcc9-sm75 +======= + linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: + name: cuda12.8-py3.10-gcc9-sm75 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm75 docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: '7.5' test-matrix: | { include: [ @@ -521,6 +798,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_4-py3_10-gcc9-inductor-test: name: cuda12.4-py3.10-gcc9-sm75 uses: ./.github/workflows/_linux-test.yml @@ -540,6 +818,27 @@ jobs: runner_prefix: ${{ needs.get-label-type.outputs.label-type }} build-environment: linux-jammy-xpu-2025.0-py3.9 docker-image-name: pytorch-linux-jammy-xpu-2025.0-py3 +======= + linux-jammy-cuda12_8-py3_10-gcc9-inductor-test: + name: cuda12.8-py3.10-gcc9-sm75 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-xpu-2025_1-py3_9-build: + name: linux-jammy-xpu-2025.1-py3.9 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + sync-tag: linux-xpu-2025-1-build + runner_prefix: ${{ needs.get-label-type.outputs.label-type }} + build-environment: linux-jammy-xpu-2025.1-py3.9 + docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.1-py3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 4, runner: "linux.idc.xpu" }, diff --git a/.github/workflows/revert.yml b/.github/workflows/revert.yml index 5b3cb1265c8b..708b7e234f56 100644 --- a/.github/workflows/revert.yml +++ b/.github/workflows/revert.yml @@ -13,14 +13,22 @@ jobs: GH_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} steps: - name: Checkout repo +<<<<<<< HEAD uses: actions/checkout@v2 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) id: checkout with: fetch-depth: 0 token: ${{ secrets.MERGEBOT_TOKEN }} - name: Setup Python +<<<<<<< HEAD uses: actions/setup-python@v4 +======= + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 
([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: '3.9' architecture: x64 diff --git a/.github/workflows/rocm-mi300.yml b/.github/workflows/rocm-mi300.yml index e83e776223a6..1b462fe10fe3 100644 --- a/.github/workflows/rocm-mi300.yml +++ b/.github/workflows/rocm-mi300.yml @@ -28,7 +28,11 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -36,15 +40,26 @@ jobs: curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-build: if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} name: linux-focal-rocm6.3-py3.10 +======= + linux-jammy-rocm-py3_10-build: + if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} + name: linux-jammy-rocm-py3.10-mi300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-rocm6.3-py3.10 docker-image-name: pytorch-linux-focal-rocm-n-py3 +======= + build-environment: linux-jammy-rocm-py3.10-mi300 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sync-tag: rocm-build test-matrix: | { include: [ @@ -57,6 +72,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-test: permissions: id-token: write @@ -70,4 +86,19 @@ jobs: build-environment: linux-focal-rocm6.3-py3.10 docker-image: ${{ needs.linux-focal-rocm6_3-py3_10-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-rocm6_3-py3_10-build.outputs.test-matrix }} +======= + linux-jammy-rocm-py3_10-test: + permissions: + id-token: write + contents: read + name: linux-jammy-rocm-py3.10-mi300 + uses: ./.github/workflows/_rocm-test.yml + needs: + - linux-jammy-rocm-py3_10-build + - target-determination + with: + build-environment: linux-jammy-rocm-py3.10-mi300 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/rocm.yml b/.github/workflows/rocm.yml index 6ff8667a9d94..af105e7e07ca 100644 --- a/.github/workflows/rocm.yml +++ b/.github/workflows/rocm.yml @@ -26,6 +26,7 @@ jobs: id-token: write contents: read +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-build: if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} name: linux-focal-rocm6.3-py3.10 @@ -33,6 +34,15 @@ jobs: with: build-environment: 
linux-focal-rocm6.3-py3.10 docker-image-name: pytorch-linux-focal-rocm-n-py3 +======= + linux-jammy-rocm-py3_10-build: + if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} + name: linux-jammy-rocm-py3.10 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-jammy-rocm-py3.10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sync-tag: rocm-build test-matrix: | { include: [ @@ -45,6 +55,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-test: permissions: id-token: write @@ -58,4 +69,19 @@ jobs: build-environment: linux-focal-rocm6.3-py3.10 docker-image: ${{ needs.linux-focal-rocm6_3-py3_10-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-rocm6_3-py3_10-build.outputs.test-matrix }} +======= + linux-jammy-rocm-py3_10-test: + permissions: + id-token: write + contents: read + name: linux-jammy-rocm-py3.10 + uses: ./.github/workflows/_rocm-test.yml + needs: + - linux-jammy-rocm-py3_10-build + - target-determination + with: + build-environment: linux-jammy-rocm-py3.10 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/runner-determinator-validator.yml b/.github/workflows/runner-determinator-validator.yml index 72581829f7a0..df2de9fbc861 100644 --- a/.github/workflows/runner-determinator-validator.yml +++ b/.github/workflows/runner-determinator-validator.yml @@ -20,7 +20,11 @@ jobs: steps: - name: Checkout repository +<<<<<<< HEAD uses: actions/checkout@v2 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Run Hardcode runner-determinator script id: hardcode-script @@ -38,4 +42,8 @@ jobs: # version embedded into .github/workflows/_runner-determinator.yml diff runner_determinator_workflow.py .github/scripts/runner_determinator.py # Fail the job if the scripts are not identical - continue-on-error: false \ No newline at end of file +<<<<<<< HEAD + continue-on-error: false +======= + continue-on-error: false +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/runner_determinator_script_sync.yaml b/.github/workflows/runner_determinator_script_sync.yaml index a47c3b418860..102793c8162a 100644 --- a/.github/workflows/runner_determinator_script_sync.yaml +++ b/.github/workflows/runner_determinator_script_sync.yaml @@ -15,7 +15,11 @@ jobs: runs-on: ubuntu-latest steps: +<<<<<<< HEAD - uses: actions/checkout@v4 +======= + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: sparse-checkout: | .github diff --git a/.github/workflows/s390.yml b/.github/workflows/s390.yml index f95af2bd1561..eeacc99ee0b2 100644 --- a/.github/workflows/s390.yml +++ b/.github/workflows/s390.yml @@ -2,8 +2,11 @@ name: s390 on: push: +<<<<<<< HEAD branches: - 
main +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) tags: - ciflow/s390/* workflow_dispatch: @@ -21,6 +24,10 @@ jobs: uses: ./.github/workflows/_linux-build.yml with: build-environment: linux-s390x-binary-manywheel +<<<<<<< HEAD docker-image-name: pytorch/manylinuxs390x-builder:cpu-s390x-main +======= + docker-image-name: pytorch/manylinuxs390x-builder:cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runner: linux.s390x secrets: inherit diff --git a/.github/workflows/s390x-periodic.yml b/.github/workflows/s390x-periodic.yml index 67f68fcaee9a..07c027bfb0e1 100644 --- a/.github/workflows/s390x-periodic.yml +++ b/.github/workflows/s390x-periodic.yml @@ -9,8 +9,11 @@ on: tags: - ciflow/periodic/* - ciflow/s390/* +<<<<<<< HEAD branches: - release/* +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) workflow_dispatch: concurrency: @@ -42,7 +45,11 @@ jobs: uses: ./.github/workflows/_linux-build.yml with: build-environment: linux-s390x-binary-manywheel +<<<<<<< HEAD docker-image-name: pytorch/manylinuxs390x-builder:cpu-s390x-main +======= + docker-image-name: pytorch/manylinuxs390x-builder:cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runner: linux.s390x test-matrix: | { include: [ @@ -70,8 +77,14 @@ jobs: - target-determination with: build-environment: linux-s390x-binary-manywheel +<<<<<<< HEAD docker-image: pytorch/manylinuxs390x-builder:cpu-s390x-main test-matrix: ${{ needs.linux-manylinux-2_28-py3-cpu-s390x-build.outputs.test-matrix }} timeout-minutes: 480 +======= + docker-image: pytorch/manylinuxs390x-builder:cpu-s390x + test-matrix: ${{ needs.linux-manylinux-2_28-py3-cpu-s390x-build.outputs.test-matrix }} + timeout-minutes: 600 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use-gha: "yes" secrets: inherit diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 9567e15d2f5d..96581b082787 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -25,12 +25,20 @@ jobs: steps: - name: "Checkout code" +<<<<<<< HEAD uses: actions/checkout@v3 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: persist-credentials: false - name: "Run analysis" +<<<<<<< HEAD uses: ossf/scorecard-action@865b4092859256271290c77adbd10a43f4779972 # tag=v2.0.3 +======= + uses: ossf/scorecard-action@865b4092859256271290c77adbd10a43f4779972 # v2.0.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: results_file: results.sarif results_format: sarif @@ -42,7 +50,11 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. 
- name: "Upload artifact" +<<<<<<< HEAD uses: actions/upload-artifact@v4 +======= + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: name: SARIF file path: results.sarif @@ -50,6 +62,10 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" +<<<<<<< HEAD uses: github/codeql-action/upload-sarif@5f532563584d71fdef14ee64d17bafb34f751ce5 # tag=v1.0.26 +======= + uses: github/codeql-action/upload-sarif@5f532563584d71fdef14ee64d17bafb34f751ce5 # v1.0.26 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: sarif_file: results.sarif diff --git a/.github/workflows/slow.yml b/.github/workflows/slow.yml index b0c73f0a3969..496f9186b1e4 100644 --- a/.github/workflows/slow.yml +++ b/.github/workflows/slow.yml @@ -39,7 +39,11 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -47,14 +51,24 @@ jobs: curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc11-sm86-build: name: linux-focal-cuda12.6-py3.10-gcc11-sm86 +======= + linux-jammy-cuda12_8-py3_10-gcc11-sm86-build: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm86 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3.10-gcc11-sm86 docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc11 +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: 8.6 test-matrix: | { include: [ @@ -64,6 +78,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc11-sm86-test: name: linux-focal-cuda12.6-py3.10-gcc11-sm86 uses: ./.github/workflows/_linux-test.yml @@ -78,12 +93,33 @@ jobs: linux-focal-py3_9-clang10-build: name: linux-focal-py3.9-clang10 +======= + linux-jammy-cuda12_8-py3_10-gcc11-sm86-test: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm86 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-sm86-build + - target-determination + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm86-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm86-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-py3_9-clang12-build: + name: linux-jammy-py3.9-clang12 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-py3.9-clang10 docker-image-name: pytorch-linux-focal-py3.9-clang10 +======= + build-environment: linux-jammy-py3.9-clang12 + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-clang12 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "slow", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, @@ -91,6 +127,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-py3_9-clang10-test: name: linux-focal-py3.9-clang10 uses: ./.github/workflows/_linux-test.yml @@ -105,12 +142,33 @@ jobs: linux-focal-rocm6_3-py3_10-build: name: linux-focal-rocm6.3-py3.10 +======= + linux-jammy-py3_9-clang12-test: + name: linux-jammy-py3.9-clang12 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-py3_9-clang12-build + - target-determination + with: + build-environment: linux-jammy-py3.9-clang12 + docker-image: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-rocm-py3_10-build: + name: linux-jammy-rocm-py3.10 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-rocm6.3-py3.10 docker-image-name: pytorch-linux-focal-rocm-n-py3 +======= + build-environment: linux-jammy-rocm-py3.10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "slow", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] }, @@ -118,6 +176,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-test: permissions: id-token: write @@ -135,10 +194,30 @@ jobs: linux-jammy-py3_10-clang15-asan-build: name: linux-jammy-py3.10-clang15-asan +======= + linux-jammy-rocm-py3_10-test: + permissions: + id-token: write + contents: read + name: linux-jammy-rocm-py3.10 + uses: ./.github/workflows/_rocm-test.yml + needs: + - linux-jammy-rocm-py3_10-build + - target-determination + with: + build-environment: linux-jammy-rocm-py3.10 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-py3_10-clang18-asan-build: + name: linux-jammy-py3.10-clang18-asan +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-py3.10-clang15-asan docker-image-name: pytorch-linux-jammy-py3-clang15-asan test-matrix: | @@ -146,10 +225,20 @@ jobs: { config: "slow", shard: 1, num_shards: 3, runner: "linux.4xlarge" }, { config: "slow", shard: 2, num_shards: 3, runner: "linux.4xlarge" }, { config: "slow", shard: 3, num_shards: 3, runner: "linux.4xlarge" }, +======= + build-environment: 
linux-jammy-py3.10-clang18-asan + docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan + test-matrix: | + { include: [ + { config: "slow", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "slow", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "slow", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} sync-tag: asan-build secrets: inherit +<<<<<<< HEAD linux-jammy-py3_10-clang15-asan-test: name: linux-jammy-py3.10-clang15-asan uses: ./.github/workflows/_linux-test.yml @@ -160,5 +249,17 @@ jobs: build-environment: linux-jammy-py3.10-clang15-asan docker-image: ${{ needs.linux-jammy-py3_10-clang15-asan-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-py3_10-clang15-asan-build.outputs.test-matrix }} +======= + linux-jammy-py3_10-clang18-asan-test: + name: linux-jammy-py3.10-clang18-asan + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-py3_10-clang18-asan-build + - target-determination + with: + build-environment: linux-jammy-py3.10-clang18-asan + docker-image: ${{ needs.linux-jammy-py3_10-clang18-asan-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_10-clang18-asan-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sync-tag: asan-test secrets: inherit diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 047e4a47ab97..08822d4943f4 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -27,7 +27,11 @@ jobs: pull-requests: write steps: +<<<<<<< HEAD - uses: actions/github-script@v6 +======= + - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: script: | // Do some dumb retries on requests. 
diff --git a/.github/workflows/target-determination-indexer.yml b/.github/workflows/target-determination-indexer.yml index 363b59b78054..af0d59fead54 100644 --- a/.github/workflows/target-determination-indexer.yml +++ b/.github/workflows/target-determination-indexer.yml @@ -13,7 +13,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -26,7 +30,11 @@ jobs: environment: target-determinator-env steps: - name: Clone PyTorch +<<<<<<< HEAD uses: actions/checkout@v3 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: path: pytorch @@ -35,9 +43,15 @@ jobs: - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7 with: docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9 +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) working-directory: pytorch - name: Use following to pull public copy of the image @@ -46,34 +60,57 @@ jobs: ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} shell: bash run: | +<<<<<<< HEAD tag=${ECR_DOCKER_IMAGE##*/} echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" - name: Pull docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 +======= + tag=${ECR_DOCKER_IMAGE##*:} + echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG id: install-nvidia-driver +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.7 - name: Clone CodeLlama uses: actions/checkout@v3 +======= + uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.8 + + - name: Clone CodeLlama + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: repository: osalpekar/codellama ref: 1ec50e0cfc0fadc3b6ceb146617e2119ab26eb34 path: codellama - name: Clone Target Determination Code +<<<<<<< HEAD uses: actions/checkout@v3 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: repository: osalpekar/llm-target-determinator ref: v0.0.2 path: llm-target-determinator - name: 
Configure AWS credentials +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v3 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::308535385114:role/gha_target_determinator_s3_read_write aws-region: us-east-1 @@ -100,6 +137,11 @@ jobs: AWS_DEFAULT_REGION: us-east-1 run: | # detached container should get cleaned up by teardown_ec2_linux +<<<<<<< HEAD +======= + # Disable shellcheck warning for GPU_FLAG + # shellcheck disable=SC2086 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) container_name=$(docker run \ ${GPU_FLAG:-} \ -e MAX_JOBS="$(nproc --ignore=2)" \ @@ -147,7 +189,11 @@ jobs: "s3://target-determinator-assets/indexes/latest/${ZIP_NAME}" - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() concurrency: diff --git a/.github/workflows/target_determination.yml b/.github/workflows/target_determination.yml index 7ed28deb94f2..fcb2586a1e9a 100644 --- a/.github/workflows/target_determination.yml +++ b/.github/workflows/target_determination.yml @@ -9,7 +9,11 @@ jobs: name: get-label-type # Don't run on forked repos if: github.repository_owner == 'pytorch' +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -27,7 +31,11 @@ jobs: # checkout because when we run this action we don't *have* a local # checkout. In other cases you should prefer a local checkout. 
- name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false @@ -49,7 +57,11 @@ jobs: job_identifier: ${{ github.workflow }} - name: Download LLM Artifacts from S3 +<<<<<<< HEAD uses: seemethere/download-artifact-s3@v4 +======= + uses: seemethere/download-artifact-s3@1da556a7aa0a088e3153970611f6c432d58e80e6 # v4.2.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: name: llm_results @@ -76,7 +88,11 @@ jobs: python3 tools/testing/do_target_determination_for_s3.py - name: Upload TD results to s3 +<<<<<<< HEAD uses: seemethere/upload-artifact-s3@v5 +======= + uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: steps.td.outcome == 'success' with: name: td_results @@ -85,7 +101,11 @@ jobs: path: td_results.json - name: Store TD results on GHA +<<<<<<< HEAD uses: actions/upload-artifact@v4 +======= + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: steps.td.outcome == 'success' with: name: td_results.json diff --git a/.github/workflows/test-check-binary.yml b/.github/workflows/test-check-binary.yml index c6898d36353e..c30166dfd5c9 100644 --- a/.github/workflows/test-check-binary.yml +++ b/.github/workflows/test-check-binary.yml @@ -15,26 +15,47 @@ jobs: check_binary_linux_cpu: if: github.repository_owner == 'pytorch' name: Test check_binary.sh for Linux CPU +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.7 +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: python:3.11 docker-build-dir: "skip-docker-build" script: | pushd .ci/pytorch/ +<<<<<<< HEAD pip install --pre torch --index-url https://download.pytorch.org/whl/test/cpu DESIRED_PYTHON=3.11 DESIRED_CUDA=cpu DESIRED_DEVTOOLSET=cxx11-abi PACKAGE_TYPE=manywheel ./check_binary.sh +======= + pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu + DESIRED_PYTHON=3.11 DESIRED_CUDA=cpu PACKAGE_TYPE=manywheel ./check_binary.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) popd check_binary_linux_cuda: if: github.repository_owner == 'pytorch' name: Test check_binary.sh for Linux CUDA +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.7 +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: runner: linux.4xlarge.nvidia.gpu docker-image: python:3.11 docker-build-dir: "skip-docker-build" script: | +<<<<<<< HEAD pushd .ci/pytorch/ pip install --pre torch --index-url https://download.pytorch.org/whl/test/cu126 DESIRED_PYTHON=3.11 DESIRED_CUDA=cu126 
DESIRED_DEVTOOLSET=cxx11-abi PACKAGE_TYPE=manywheel ./check_binary.sh +======= + STABLE_CUDA_VERSION=$(python3 .github/scripts/get_ci_variable.py --cuda-stable-version) + CUDA_VERSION_NODOT=$(echo ${STABLE_CUDA_VERSION} | tr -d '.') + pushd .ci/pytorch/ + pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu${CUDA_VERSION_NODOT} + DESIRED_PYTHON=3.11 DESIRED_CUDA=cu${CUDA_VERSION_NODOT} PACKAGE_TYPE=manywheel ./check_binary.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) popd diff --git a/.github/workflows/test-h100.yml b/.github/workflows/test-h100.yml new file mode 100644 index 000000000000..9d92e969686f --- /dev/null +++ b/.github/workflows/test-h100.yml @@ -0,0 +1,55 @@ +name: Limited CI on H100 + +on: + pull_request: + paths: + - .github/workflows/test-h100.yml + workflow_dispatch: + schedule: + - cron: 0 4,10,16,22 * * * # every 6 hours + push: + tags: + - ciflow/h100/* + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + + get-label-type: + if: github.repository_owner == 'pytorch' + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + + linux-jammy-cuda12_8-py3_10-gcc11-sm90-build: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm90 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runner: "linux.12xlarge" + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '9.0' + test-matrix: | + { include: [ + { config: "smoke", shard: 1, num_shards: 1, runner: "linux.aws.h100" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-sm90-test: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm90 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-sm90-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build.outputs.test-matrix }} + secrets: inherit diff --git a/.github/workflows/torchbench.yml b/.github/workflows/torchbench.yml index 4717c309c788..a3b617e2c5bc 100644 --- a/.github/workflows/torchbench.yml +++ b/.github/workflows/torchbench.yml @@ -14,22 +14,36 @@ jobs: get-default-label-prefix: if: github.repository_owner == 'pytorch' name: get-default-label-prefix +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD 
linux-focal-cuda12_4-py3_10-gcc9-torchbench-build-gcp: name: cuda12.4-py3.10-gcc9-sm80 +======= + build: + name: cuda12.8-py3.10-gcc9-sm80 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: - get-default-label-prefix with: runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm80 docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: '8.0' test-matrix: | { include: [ @@ -37,6 +51,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_4-py3_10-gcc9-torchbench-test-gcp: name: cuda12.4-py3.10-gcc9-sm80 uses: ./.github/workflows/_linux-test.yml @@ -45,4 +60,14 @@ jobs: build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm80 docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-torchbench-build-gcp.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-torchbench-build-gcp.outputs.test-matrix }} +======= + test: + name: cuda12.8-py3.10-gcc9-sm80 + uses: ./.github/workflows/_linux-test.yml + needs: build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image: ${{ needs.build.outputs.docker-image }} + test-matrix: ${{ needs.build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/trunk-tagging.yml b/.github/workflows/trunk-tagging.yml new file mode 100644 index 000000000000..b460195c37e6 --- /dev/null +++ b/.github/workflows/trunk-tagging.yml @@ -0,0 +1,224 @@ +name: trunk-tagging + +on: + push: + branches: + - main + workflow_dispatch: + inputs: + commit_sha: + description: 'Commit SHA to tag (leave empty for current HEAD)' + required: false + type: string + +concurrency: + group: trunk-tagging-${{ github.event.inputs.commit_sha || github.sha }} + cancel-in-progress: false + +permissions: + contents: write + +jobs: + tag-trunk-commit: + name: Tag trunk commit + runs-on: ubuntu-latest + if: github.repository_owner == 'pytorch' + + steps: + - name: Pre-checkout validation + run: | + # For workflow_dispatch, validate SHA format before checkout + if [ -n "${{ github.event.inputs.commit_sha }}" ]; then + COMMIT_SHA="${{ github.event.inputs.commit_sha }}" + + # Verify it's a well-formed SHA (40 hex characters) + if ! echo "${COMMIT_SHA}" | grep -qE '^[a-f0-9]{40}$'; then + echo "Error: Invalid commit SHA format. 
Expected 40 hexadecimal characters, got: ${COMMIT_SHA}" + exit 1 + fi + + echo "✅ Pre-checkout validation passed for: ${COMMIT_SHA}" + else + echo "✅ Using current commit SHA - no pre-checkout validation needed" + fi + + - name: Checkout repository + uses: actions/checkout@v4 + with: + # Fetch full history to ensure we have all commits + fetch-depth: 0 + # For workflow_dispatch, checkout the specified commit + ref: ${{ github.event.inputs.commit_sha || github.sha }} + + - name: Set commit SHA + id: commit + run: | + if [ -n "${{ github.event.inputs.commit_sha }}" ]; then + COMMIT_SHA="${{ github.event.inputs.commit_sha }}" + else + COMMIT_SHA="${{ github.sha }}" + fi + echo "sha=${COMMIT_SHA}" >> "${GITHUB_OUTPUT}" + echo "tag_name=trunk/${COMMIT_SHA}" >> "${GITHUB_OUTPUT}" + + - name: Validate commit SHA + run: | + COMMIT_SHA="${{ steps.commit.outputs.sha }}" + + # Verify the commit exists and is valid + if ! git cat-file -e "${COMMIT_SHA}"; then + echo "Error: Commit SHA ${COMMIT_SHA} does not exist in repository" + exit 1 + fi + + # For workflow_dispatch, verify the commit exists on main branch + if [ -n "${{ github.event.inputs.commit_sha }}" ]; then + echo "Manual dispatch detected - validating commit is on main branch..." + + # Get all commits reachable from main branch + if ! git merge-base --is-ancestor "${COMMIT_SHA}" origin/main; then + echo "Error: Commit ${COMMIT_SHA} is not reachable from main branch" + echo "Only commits that exist on the main branch can be tagged" + exit 1 + fi + + echo "✅ Commit ${COMMIT_SHA} is valid and exists on main branch" + else + echo "✅ Commit ${COMMIT_SHA} is valid (automatic push trigger)" + fi + + - name: Create and push tag with retry + id: check_tag + env: + TAG_NAME: ${{ steps.commit.outputs.tag_name }} + COMMIT_SHA: ${{ steps.commit.outputs.sha }} + run: | + set -e + + # Check if tag already exists + check_tag_exists() { + # Check if tag exists locally + if git tag -l "${TAG_NAME}" | grep -q "${TAG_NAME}"; then + echo "Tag ${TAG_NAME} already exists locally" + return 0 + fi + + # Check if tag exists on remote + if git ls-remote --tags origin "${TAG_NAME}" | grep -q "${TAG_NAME}"; then + echo "Tag ${TAG_NAME} already exists on remote" + return 0 + fi + + return 1 + } + + # Exit early if tag already exists + if check_tag_exists; then + echo "✅ Tag already exists - no action needed" + echo "exists=true" >> "${GITHUB_OUTPUT}" + exit 0 + fi + + echo "Tag ${TAG_NAME} does not exist, proceeding with creation" + + # Retry configuration + MAX_RETRIES=5 + BASE_DELAY=2 + BACKOFF_MULTIPLIER=4 + MAX_DELAY=3600 + + # Common retry function with exponential backoff + retry_with_backoff() { + local command="${1}" + local description="${2}" + local retry_count=0 + + while [ "${retry_count}" -le "${MAX_RETRIES}" ]; do + echo "Attempt $((retry_count + 1))/$((MAX_RETRIES + 1)): ${description}" + + if eval "${command}"; then + echo "Success on attempt $((retry_count + 1))" + return 0 + fi + + retry_count=$((retry_count + 1)) + + if [ "${retry_count}" -le "${MAX_RETRIES}" ]; then + # Calculate delay with exponential backoff + local delay=$((BASE_DELAY * (BACKOFF_MULTIPLIER ** retry_count))) + if [ "${delay}" -gt "${MAX_DELAY}" ]; then + delay="${MAX_DELAY}" + fi + + echo "Failed. Retrying in ${delay} seconds..." + sleep "${delay}" + fi + done + + echo "All retry attempts exhausted" + return 1 + } + + # Function to create and push tag + create_and_push_tag() { + # Create the tag + if ! 
git tag "${TAG_NAME}" "${COMMIT_SHA}"; then + echo "Failed to create local tag" + return 1 + fi + + # Push the tag + if git push origin "${TAG_NAME}"; then + echo "Successfully created and pushed tag ${TAG_NAME}" + return 0 + else + echo "Failed to push tag to remote" + # Clean up local tag for retry + git tag -d "${TAG_NAME}" 2>/dev/null || true + return 1 + fi + } + + # Function to handle retries with race condition checks + tag_with_retry() { + # Check if tag exists before attempting creation + if check_tag_exists; then + echo "Tag ${TAG_NAME} was created by another process, exiting successfully" + return 0 + fi + + create_and_push_tag || { + # Fetch latest state for next retry + git fetch origin --tags + return 1 + } + } + + # Execute with retry + if retry_with_backoff "tag_with_retry" "Creating tag ${TAG_NAME} for commit ${COMMIT_SHA}"; then + echo "exists=false" >> "${GITHUB_OUTPUT}" + exit 0 + else + echo "Tag creation failed after all retry attempts" + exit 1 + fi + + - name: Tag creation summary + if: always() + run: | + if [ "${{ steps.check_tag.outputs.exists }}" = "true" ]; then + echo "✅ Tag ${{ steps.commit.outputs.tag_name }} already existed - no action needed" + elif [ "${{ job.status }}" = "success" ]; then + echo "✅ Successfully created tag ${{ steps.commit.outputs.tag_name }} for commit ${{ steps.commit.outputs.sha }}" + else + echo "❌ Failed to create tag ${{ steps.commit.outputs.tag_name }} for commit ${{ steps.commit.outputs.sha }}" + fi + + echo "" + echo "Tag details:" + echo " Name: ${{ steps.commit.outputs.tag_name }}" + echo " Commit: ${{ steps.commit.outputs.sha }}" + echo " Trigger: ${{ github.event_name }}" + if [ -n "${{ github.event.inputs.commit_sha }}" ]; then + echo " Manual commit: ${{ github.event.inputs.commit_sha }}" + fi diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 6d0fa57ef212..571842e24cdd 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -37,7 +37,11 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -45,6 +49,7 @@ jobs: curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD libtorch-linux-focal-cuda12_6-py3_10-gcc11-debug-build: name: libtorch-linux-focal-cuda12.6-py3.10-gcc11-debug uses: ./.github/workflows/_linux-build.yml @@ -52,6 +57,15 @@ jobs: with: build-environment: libtorch-linux-focal-cuda12.6-py3.10-gcc11 docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc11 +======= + libtorch-linux-jammy-cuda12_8-py3_10-gcc11-debug-build: + name: libtorch-linux-jammy-cuda12.8-py3.10-gcc11-debug + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + build-environment: libtorch-linux-jammy-cuda12.8-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build-generates-artifacts: false runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner: "linux.4xlarge" @@ 
-62,14 +76,24 @@ jobs: secrets: inherit # no-ops builds test USE_PER_OPERATOR_HEADERS=0 where ATen/ops is not generated +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc11-no-ops-build: name: linux-focal-cuda12.6-py3.10-gcc11-no-ops +======= + linux-jammy-cuda12_8-py3_10-gcc11-no-ops-build: + name: linux-jammy-cuda12.8-py3.10-gcc11-no-ops +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3.10-gcc11-no-ops docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc11 +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-no-ops + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, @@ -86,12 +110,17 @@ jobs: runner-type: macos-m1-stable build-generates-artifacts: true # To match the one pre-installed in the m1 runners +<<<<<<< HEAD python-version: 3.9.12 +======= + python-version: 3.12.7 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 3, runner: "macos-m1-stable" }, { config: "default", shard: 2, num_shards: 3, runner: "macos-m1-stable" }, { config: "default", shard: 3, num_shards: 3, runner: "macos-m1-stable" }, +<<<<<<< HEAD ]} secrets: inherit @@ -109,6 +138,11 @@ jobs: { include: [ { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-13" }, { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-14" }, +======= + { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-13" }, + { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-14" }, + { config: "mps", shard: 1, num_shards: 1, runner: "macos-m2-15" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} secrets: inherit @@ -121,8 +155,14 @@ jobs: with: build-environment: macos-py3-arm64 # Same as the build job +<<<<<<< HEAD python-version: 3.9.12 test-matrix: ${{ needs.macos-py3-arm64-build.outputs.test-matrix }} +======= + python-version: 3.12.7 + test-matrix: ${{ needs.macos-py3-arm64-build.outputs.test-matrix }} + disable-monitor: false +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit win-vs2022-cpu-py3-build: @@ -132,7 +172,10 @@ jobs: with: build-environment: win-vs2022-cpu-py3 cuda-version: cpu +<<<<<<< HEAD sync-tag: win-cpu-build +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" test-matrix: | { include: [ @@ -152,6 +195,10 @@ jobs: build-environment: win-vs2022-cpu-py3 cuda-version: cpu test-matrix: ${{ needs.win-vs2022-cpu-py3-build.outputs.test-matrix }} +<<<<<<< HEAD +======= + disable-monitor: false +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit win-vs2022-cuda12_6-py3-build: @@ -164,15 +211,26 @@ jobs: runner: "${{ 
needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" secrets: inherit +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-build: if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }} name: linux-focal-rocm6.3-py3.10 +======= + linux-jammy-rocm-py3_10-build: + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }} + name: linux-jammy-rocm-py3.10 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-rocm6.3-py3.10 docker-image-name: pytorch-linux-focal-rocm-n-py3 +======= + build-environment: linux-jammy-rocm-py3.10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sync-tag: rocm-build test-matrix: | { include: [ @@ -182,11 +240,16 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-test: +======= + linux-jammy-rocm-py3_10-test: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }} permissions: id-token: write contents: read +<<<<<<< HEAD name: linux-focal-rocm6.3-py3.10 uses: ./.github/workflows/_rocm-test.yml needs: @@ -196,10 +259,22 @@ jobs: build-environment: linux-focal-rocm6.3-py3.10 docker-image: ${{ needs.linux-focal-rocm6_3-py3_10-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-rocm6_3-py3_10-build.outputs.test-matrix }} +======= + name: linux-jammy-rocm-py3.10 + uses: ./.github/workflows/_rocm-test.yml + needs: + - linux-jammy-rocm-py3_10-build + - target-determination + with: + build-environment: linux-jammy-rocm-py3.10 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl" secrets: inherit # NB: Keep this in sync with inductor-perf-test-nightly.yml +<<<<<<< HEAD linux-focal-cuda12_4-py3_10-gcc9-inductor-build: name: cuda12.4-py3.10-gcc9-sm80 uses: ./.github/workflows/_linux-build.yml @@ -207,6 +282,15 @@ jobs: with: build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm80 docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks +======= + linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: + name: cuda12.8-py3.10-gcc9-sm80 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: '8.0' secrets: inherit @@ -217,7 +301,11 @@ jobs: with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.9-gcc11 +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3.9-gcc11 +======= + docker-image-name: 
ci-image:pytorch-linux-jammy-py3.9-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "verify_cachebench", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, diff --git a/.github/workflows/trymerge.yml b/.github/workflows/trymerge.yml index dc93cd24e19a..a373f93545f5 100644 --- a/.github/workflows/trymerge.yml +++ b/.github/workflows/trymerge.yml @@ -16,13 +16,21 @@ jobs: steps: - name: Checkout repo id: checkout +<<<<<<< HEAD uses: actions/checkout@v3 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 0 token: ${{ secrets.MERGEBOT_TOKEN }} - name: Setup Python +<<<<<<< HEAD uses: actions/setup-python@v4 +======= + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: '3.9' check-latest: false @@ -87,7 +95,11 @@ jobs: python3 .github/scripts/comment_on_pr.py "${PR_NUM}" "merge" - name: configure aws credentials +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v3 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: role-to-assume: arn:aws:iam::308535385114:role/upload_to_ossci_raw_job_status @@ -96,13 +108,21 @@ jobs: - name: Upload merge record to s3 if: always() continue-on-error: true +<<<<<<< HEAD uses: seemethere/upload-artifact-s3@v5 +======= + uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: s3-bucket: ossci-raw-job-status s3-prefix: merges/${{ github.repository }}/${{ github.event.client_payload.pr_num }}/${{ github.event.client_payload.comment_id }}/${{ github.run_id }} path: merge_record.json +<<<<<<< HEAD # We want newer merge commands to supercede old ones +======= +# We want newer merge commands to supersede old ones +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: try-merge-${{ github.event.client_payload.pr_num }} cancel-in-progress: true diff --git a/.github/workflows/tryrebase.yml b/.github/workflows/tryrebase.yml index f6039c59245d..a54b03aa80ef 100644 --- a/.github/workflows/tryrebase.yml +++ b/.github/workflows/tryrebase.yml @@ -13,13 +13,21 @@ jobs: steps: - name: Checkout repo id: checkout +<<<<<<< HEAD uses: actions/checkout@v2 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 0 token: ${{ secrets.MERGEBOT_TOKEN }} - name: Setup Python +<<<<<<< HEAD uses: actions/setup-python@v4 +======= + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: 
python-version: '3.9' architecture: x64 diff --git a/.github/workflows/unstable.yml b/.github/workflows/unstable.yml index 13e189234cfe..b4765f067bda 100644 --- a/.github/workflows/unstable.yml +++ b/.github/workflows/unstable.yml @@ -44,7 +44,11 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/update-viablestrict.yml b/.github/workflows/update-viablestrict.yml index a326f4db5b45..960ce696d502 100644 --- a/.github/workflows/update-viablestrict.yml +++ b/.github/workflows/update-viablestrict.yml @@ -18,7 +18,11 @@ jobs: environment: ${{ (github.event_name == 'schedule') && 'mergebot' || '' }} steps: - name: Update viable/strict +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/update-viablestrict@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/update-viablestrict@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) id: update_viablestrict with: repository: pytorch/pytorch @@ -30,7 +34,11 @@ jobs: clickhouse-password: ${{ secrets.CLICKHOUSE_VIABLESTRICT_PASSWORD }} - name: Authenticate to AWS with OIDC +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v4 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::308535385114:role/upload_to_ossci_raw_job_status aws-region: us-east-1 diff --git a/.github/workflows/update_pytorch_labels.yml b/.github/workflows/update_pytorch_labels.yml index 68b41c626035..c1fc945bf940 100644 --- a/.github/workflows/update_pytorch_labels.yml +++ b/.github/workflows/update_pytorch_labels.yml @@ -17,13 +17,21 @@ jobs: contents: read steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 1 submodules: false - name: configure aws credentials id: aws_creds +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v4 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_update_pytorch_labels aws-region: us-east-1 diff --git a/.github/workflows/upload-test-stats-while-running.yml b/.github/workflows/upload-test-stats-while-running.yml index 938edd11b9ec..15d907a6f9a8 100644 --- a/.github/workflows/upload-test-stats-while-running.yml +++ b/.github/workflows/upload-test-stats-while-running.yml @@ -16,7 +16,11 @@ jobs: runs-on: linux.2xlarge steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: 
pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 1 submodules: false @@ -24,6 +28,7 @@ jobs: - name: Setup Linux uses: ./.github/actions/setup-linux +<<<<<<< HEAD - name: Setup miniconda uses: pytorch/test-infra/.github/actions/setup-miniconda@release/2.7 with: @@ -32,9 +37,18 @@ jobs: - name: Install requirements run: | ${CONDA_RUN} pip install requests==2.32.2 boto3==1.35.42 +======= + - name: Install requirements + run: | + python3 -m pip install requests==2.32.2 boto3==1.35.42 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Upload test stats env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | +<<<<<<< HEAD ${CONDA_RUN} python -m tools.stats.upload_test_stats_running_jobs +======= + python3 -m tools.stats.upload_test_stats_running_jobs +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/upload-test-stats.yml b/.github/workflows/upload-test-stats.yml index c7c2acbb9c46..2f146b66b3df 100644 --- a/.github/workflows/upload-test-stats.yml +++ b/.github/workflows/upload-test-stats.yml @@ -2,7 +2,29 @@ name: Upload test stats on: workflow_run: +<<<<<<< HEAD workflows: [pull, trunk, periodic, inductor, unstable, slow, unstable-periodic, inductor-periodic, rocm, rocm-mi300, inductor-micro-benchmark, inductor-micro-benchmark-x86, inductor-cu124, inductor-rocm, inductor-rocm-mi300, mac-mps] +======= + workflows: + - pull + - trunk + - periodic + - periodic-rocm-mi300 + - inductor + - unstable + - slow + - unstable-periodic + - inductor-periodic + - rocm + - rocm-mi300 + - inductor-micro-benchmark + - inductor-micro-benchmark-x86 + - inductor-cu124 + - inductor-rocm + - inductor-rocm-mi300 + - mac-mps + - linux-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) types: - completed @@ -17,7 +39,11 @@ jobs: steps: - name: Get workflow run conclusion # TODO (huydhn): Pin this once https://github.com/octokit/request-action/issues/315 is resolved +<<<<<<< HEAD uses: octokit/request-action@release/2.7 +======= + uses: octokit/request-action@05a2312de9f8207044c4c9e41fe19703986acc13 # v2.x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) id: get_conclusion with: route: GET /repos/${{ github.repository }}/actions/runs/${{ github.event.workflow_run.id }}/attempts/${{ github.event.workflow_run.run_attempt }} @@ -39,16 +65,27 @@ jobs: run: echo "${TRIGGERING_WORKFLOW}" - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 - name: Configure aws credentials uses: aws-actions/configure-aws-credentials@v3 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 + + - name: Configure aws credentials + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-torch-test-stats aws-region: us-east-1 +<<<<<<< 
HEAD - uses: actions/setup-python@v4 +======= + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: '3.11' cache: pip diff --git a/.github/workflows/upload-torch-dynamo-perf-stats.yml b/.github/workflows/upload-torch-dynamo-perf-stats.yml index d9979b2dcaf0..f2056850dee2 100644 --- a/.github/workflows/upload-torch-dynamo-perf-stats.yml +++ b/.github/workflows/upload-torch-dynamo-perf-stats.yml @@ -2,7 +2,11 @@ name: Upload torch dynamo performance stats on: workflow_run: +<<<<<<< HEAD workflows: [inductor-A100-perf-nightly, inductor-perf-nightly-A10g, inductor-perf-nightly-aarch64, inductor-perf-nightly-x86, perf-nightly-macos, inductor-perf-nightly-rocm, inductor-perf-nightly-h100] +======= + workflows: [inductor-A100-perf-nightly, inductor-perf-nightly-A10g, inductor-perf-nightly-aarch64, inductor-perf-nightly-x86, inductor-perf-nightly-macos, inductor-perf-nightly-rocm, inductor-perf-nightly-h100] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) types: - completed @@ -14,7 +18,11 @@ jobs: steps: - name: Get workflow run conclusion # TODO (huydhn): Pin this once https://github.com/octokit/request-action/issues/315 is resolved +<<<<<<< HEAD uses: octokit/request-action@release/2.7 +======= + uses: octokit/request-action@05a2312de9f8207044c4c9e41fe19703986acc13 # v2.x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) id: get-conclusion with: route: GET /repos/${{ github.repository }}/actions/runs/${{ github.event.workflow_run.id }}/attempts/${{ github.event.workflow_run.run_attempt }} @@ -32,19 +40,31 @@ jobs: name: Upload dynamo performance stats for ${{ github.event.workflow_run.id }}, attempt ${{ github.event.workflow_run.run_attempt }} steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false fetch-depth: 1 - name: Configure aws credentials +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v3 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-torch-test-stats aws-region: us-east-1 +<<<<<<< HEAD - uses: actions/setup-python@v4 +======= + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: '3.11' cache: pip diff --git a/.github/workflows/upload_test_stats_intermediate.yml b/.github/workflows/upload_test_stats_intermediate.yml index e8958ea8b651..a3dad9a8da48 100644 --- a/.github/workflows/upload_test_stats_intermediate.yml +++ b/.github/workflows/upload_test_stats_intermediate.yml @@ -17,12 +17,20 @@ jobs: environment: upload-stats steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: 
pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 1 submodules: false +<<<<<<< HEAD - uses: actions/setup-python@v4 +======= + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: '3.11' cache: pip @@ -31,7 +39,11 @@ jobs: pip3 install requests==2.32.2 boto3==1.35.42 - name: Authenticate with AWS +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v4 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::308535385114:role/gha_upload_test_stats_intermediate_workflow aws-region: us-east-1 diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml index 84b2f2f2a122..e1175cd6103b 100644 --- a/.github/workflows/weekly.yml +++ b/.github/workflows/weekly.yml @@ -17,12 +17,20 @@ jobs: environment: update-commit-hash steps: - name: Checkout repo +<<<<<<< HEAD uses: actions/checkout@v3 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 0 - name: update-xla-commit-hash continue-on-error: true +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/update-commit-hash@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/update-commit-hash@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: repo-name: xla branch: master @@ -37,12 +45,20 @@ jobs: environment: update-commit-hash steps: - name: Checkout repo +<<<<<<< HEAD uses: actions/checkout@v3 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 0 token: ${{ secrets.UPDATEBOT_TOKEN }} - name: Setup Python +<<<<<<< HEAD uses: actions/setup-python@v2 +======= + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: '3.9' - name: Install requirements diff --git a/.github/workflows/xpu.yml b/.github/workflows/xpu.yml index c5a420f3b243..b118a21358f1 100644 --- a/.github/workflows/xpu.yml +++ b/.github/workflows/xpu.yml @@ -15,7 +15,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -30,6 +34,7 @@ jobs: sync-tag: 
linux-xpu-2025-0-build runner_prefix: ${{ needs.get-label-type.outputs.label-type }} build-environment: linux-jammy-xpu-2025.0-py3.9 +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-xpu-2025.0-py3 runner: linux.12xlarge test-matrix: | @@ -45,13 +50,59 @@ jobs: name: linux-jammy-xpu-2025.0-py3.9 uses: ./.github/workflows/_xpu-test.yml needs: linux-jammy-xpu-2025_0-py3_9-build +======= + docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.0-py3 + runner: linux.12xlarge + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 2, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 3, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 4, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 5, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 6, num_shards: 6, runner: "linux.idc.xpu" }, + ]} + secrets: inherit + + linux-jammy-xpu-2025_1-py3_9-build: + name: linux-jammy-xpu-2025.1-py3.9 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + sync-tag: linux-xpu-2025-1-build + runner_prefix: ${{ needs.get-label-type.outputs.label-type }} + build-environment: linux-jammy-xpu-2025.1-py3.9 + docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.1-py3 + runner: linux.12xlarge + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 2, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 3, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 4, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 5, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 6, num_shards: 6, runner: "linux.idc.xpu" }, + ]} + secrets: inherit + + linux-jammy-xpu-2025_1-py3_9-test: + name: linux-jammy-xpu-2025.1-py3.9 + uses: ./.github/workflows/_xpu-test.yml + needs: linux-jammy-xpu-2025_1-py3_9-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) permissions: id-token: write contents: read with: +<<<<<<< HEAD build-environment: linux-jammy-xpu-2025.0-py3.9 docker-image: ${{ needs.linux-jammy-xpu-2025_0-py3_9-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-xpu-2025_0-py3_9-build.outputs.test-matrix }} +======= + build-environment: linux-jammy-xpu-2025.1-py3.9 + docker-image: ${{ needs.linux-jammy-xpu-2025_1-py3_9-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-xpu-2025_1-py3_9-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit windows-xpu-2025_0-build: @@ -65,3 +116,18 @@ jobs: xpu-version: '2025.0' vc-year: '2022' secrets: inherit +<<<<<<< HEAD +======= + + windows-xpu-2025_1-build: + if: github.repository_owner == 'pytorch' + name: win-vs2022-xpu-2025_1-py3 + uses: ./.github/workflows/_win-build.yml + with: + build-environment: win-vs2022-xpu-py3 + cuda-version: cpu + use-xpu: true + xpu-version: '2025.1' + vc-year: '2022' + secrets: inherit +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.gitignore b/.gitignore index 7557c564a6de..cdd9c8cfb4d1 100644 --- a/.gitignore +++ b/.gitignore @@ -47,6 +47,10 @@ docs/source/generated/ 
docs/source/compile/generated/ log usage_log.txt +<<<<<<< HEAD +======= +usage_log* +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-reports/ test/*.bak test/**/*.bak @@ -62,9 +66,13 @@ test/forward_backward_compatibility/nightly_schemas.txt dropout_model.pt test/generated_type_hints_smoketest.py test/htmlcov +<<<<<<< HEAD test/cpp_extensions/install/ test/cpp_extensions/open_registration_extension/install test/cpp_extensions/libtorch_agnostic_extension/install +======= +test/cpp_extensions/**/install +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test/kernel.errors.txt third_party/build/ third_party/nccl/ @@ -180,6 +188,10 @@ compile_commands.json *.egg-info/ docs/source/scripts/activation_images/ docs/source/scripts/quantization_backend_configs/ +<<<<<<< HEAD +======= +docs/source/scripts/lr_scheduler_images/ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ## General @@ -213,6 +225,7 @@ docs/source/scripts/quantization_backend_configs/ # Compiled MATLAB *.mex* +<<<<<<< HEAD # IPython notebook checkpoints .ipynb_checkpoints @@ -222,6 +235,8 @@ docs/source/scripts/quantization_backend_configs/ *.swp *~ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NFS handle files **/.nfs* @@ -391,3 +406,9 @@ android/pytorch_android_torchvision/.cxx .arcconfig .stable_pyre_client .pyre_client +<<<<<<< HEAD +======= + +# Claude Code local configuration +CLAUDE.local.md +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.gitmodules b/.gitmodules index 3408fb8a87c5..2aec6224669a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -2,10 +2,13 @@ ignore = dirty path = third_party/pybind11 url = https://github.com/pybind/pybind11.git +<<<<<<< HEAD [submodule "third_party/eigen"] ignore = dirty path = third_party/eigen url = https://gitlab.com/libeigen/eigen.git +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) [submodule "third_party/googletest"] ignore = dirty path = third_party/googletest @@ -25,7 +28,11 @@ [submodule "third_party/gloo"] ignore = dirty path = third_party/gloo +<<<<<<< HEAD url = https://github.com/facebookincubator/gloo +======= + url = https://github.com/pytorch/gloo +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) [submodule "third_party/NNPACK_deps/pthreadpool"] ignore = dirty path = third_party/pthreadpool diff --git a/.lintrunner.toml b/.lintrunner.toml index 17163c016b24..c219b76cb535 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -18,6 +18,11 @@ exclude_patterns = [ 'torch/_inductor/autoheuristic/artifacts/**', 'scripts/**', 'test/generated_type_hints_smoketest.py', +<<<<<<< HEAD +======= + # CPython tests + 'test/dynamo/cpython/**', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Tests from the NumPy test suite 'test/torch_np/numpy_test/**/*.py', 'third_party/**', @@ -55,12 +60,20 @@ init_command = [ code = 'CLANGFORMAT' include_patterns = [ 'aten/src/ATen/*.h', +<<<<<<< HEAD +======= + 'aten/src/ATen/cpu/vec/**/*.h', +>>>>>>> 
5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'aten/src/ATen/mps/**/*.mm', 'aten/src/ATen/mps/**/*.h', 'aten/src/ATen/xpu/**/*.h', 'aten/src/ATen/xpu/**/*.cpp', 'aten/src/ATen/core/boxing/**/*.h', 'aten/src/ATen/core/dispatch/**/*.h', +<<<<<<< HEAD +======= + 'aten/src/ATen/core/Formatting.cpp', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'aten/src/ATen/native/mps/**/*.metal', 'aten/src/ATen/native/mps/**/*.mm', 'aten/src/ATen/native/mps/**/*.h', @@ -81,6 +94,12 @@ include_patterns = [ 'torch/csrc/**/*.h', 'torch/csrc/**/*.hpp', 'torch/csrc/**/*.cpp', +<<<<<<< HEAD +======= + 'torch/nativert/**/*.h', + 'torch/nativert/**/*.cpp', + 'torch/headeronly/**/*.h', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'test/cpp/**/*.h', 'test/cpp/**/*.cpp', ] @@ -147,6 +166,7 @@ init_command = [ 'numpy==1.26.4 ; python_version >= "3.9" and python_version <= "3.11"', 'numpy==2.1.0 ; python_version >= "3.12"', 'expecttest==0.3.0', +<<<<<<< HEAD 'mypy==1.14.0', 'sympy==1.13.3', 'types-requests==2.27.25', @@ -155,13 +175,27 @@ init_command = [ 'types-protobuf==3.19.18', 'types-pkg-resources==0.1.3', 'types-Jinja2==2.11.9', +======= + 'mypy==1.16.0', + 'sympy==1.13.3', + 'types-requests==2.27.25', + 'types-pyyaml==6.0.1', + 'types-tabulate==0.8.8', + 'types-protobuf==5.29.1.20250403', + 'types-setuptools==79.0.0.20250422', + 'types-jinja2==2.11.9', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'types-colorama==0.4.6', 'filelock==3.13.1', 'junitparser==2.1.1', 'rich==10.9.0', 'pyyaml==6.0.1', 'optree==0.13.0', +<<<<<<< HEAD 'dataclasses_json==0.6.7', +======= + 'dataclasses-json==0.6.7', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'pandas==2.2.3', ] @@ -223,12 +257,24 @@ include_patterns = [ 'c10/**/*.cpp', 'c10/**/*.h', 'torch/*.h', +<<<<<<< HEAD +======= + 'torch/_inductor/codegen/aoti_runtime/interface.cpp', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'torch/csrc/*.h', 'torch/csrc/*.cpp', 'torch/csrc/**/*.h', 'torch/csrc/**/*.cpp', 'torch/csrc/jit/serialization/*.h', 'torch/csrc/jit/serialization/*.cpp', +<<<<<<< HEAD +======= + 'torch/nativert/*.h', + 'torch/nativert/*.cpp', + 'torch/nativert/**/*.h', + 'torch/nativert/**/*.cpp', + 'torch/headeronly/**/*.h', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] exclude_patterns = [ # The negative filters below are to exclude files that include onnx_pb.h or @@ -271,6 +317,10 @@ exclude_patterns = [ 'torch/csrc/utils/generated_serialization_types.h', 'torch/csrc/utils/pythoncapi_compat.h', 'torch/csrc/inductor/aoti_runtime/sycl_runtime_wrappers.h', +<<<<<<< HEAD +======= + 'aten/src/ATen/ExpandBase.h', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] init_command = [ 'python3', @@ -366,7 +416,11 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', +<<<<<<< HEAD 'ruamel.yaml==0.17.4', +======= + 'ruamel.yaml==0.18.10', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel 
for mixed dtypes with float/bfloat16/half (#2791)) ] is_formatter = true @@ -379,6 +433,15 @@ command = [ '--', '@{{PATHSFILE}}' ] +<<<<<<< HEAD +======= +init_command = [ + 'python3', + 'tools/linter/adapters/pip_init.py', + '--dry-run={{DRYRUN}}', + 'ruamel.yaml==0.18.10', +] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) [[linter]] code = 'NEWLINE' @@ -396,8 +459,15 @@ exclude_patterns=[ 'tools/clang_format_hash/**', 'test/cpp/jit/upgrader_models/*.ptl', 'test/cpp/jit/upgrader_models/*.ptl.ff', +<<<<<<< HEAD + '**/*.png', + '**/*.gz', +======= + 'test/dynamo/cpython/**', '**/*.png', '**/*.gz', + '**/*.patch', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] command = [ 'python3', @@ -452,7 +522,11 @@ exclude_patterns = [ 'test/cpp/jit/upgrader_models/*.ptl.ff', '.ci/docker/common/install_rocm_drm.sh', '.lintrunner.toml', +<<<<<<< HEAD '.ci/magma/package_files/*.patch', +======= + '**/*.patch', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] command = [ 'python3', @@ -521,6 +595,10 @@ include_patterns = [ 'c10/**', 'aten/**', 'torch/csrc/**', +<<<<<<< HEAD +======= + 'torch/nativert/**', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] exclude_patterns = [ 'aten/src/ATen/native/quantized/cpu/qnnpack/**', @@ -748,6 +826,10 @@ include_patterns = [ 'aten/**', 'c10/**', 'torch/csrc/**', +<<<<<<< HEAD +======= + 'torch/nativert/**', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] exclude_patterns = [ 'aten/src/ATen/cuda/CUDAContext.cpp', @@ -933,6 +1015,10 @@ include_patterns = [ exclude_patterns = [ 'test/run_test.py', '**/fb/**', +<<<<<<< HEAD +======= + 'test/dynamo/cpython/3.13/**', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'test/quantization/**', # should be run through test/test_quantization.py 'test/jit/**', # should be run through test/test_jit.py 'test/ao/sparsity/**', # should be run through test/test_ao_sparsity.py @@ -1001,6 +1087,10 @@ include_patterns = [ 'c10/**', 'aten/**', 'torch/csrc/**', +<<<<<<< HEAD +======= + 'torch/nativert/**', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] exclude_patterns = [ 'c10/util/CallOnce.h', @@ -1045,6 +1135,10 @@ include_patterns = [ 'c10/**', 'aten/**', 'torch/csrc/**', +<<<<<<< HEAD +======= + 'torch/nativert/**', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] exclude_patterns = [ '**/fb/**', @@ -1107,6 +1201,63 @@ init_command = [ 'PyYAML==6.0.1', ] +<<<<<<< HEAD +======= +[[linter]] +code = 'CODESPELL' +command = [ + 'python3', + 'tools/linter/adapters/codespell_linter.py', + '--', + '@{{PATHSFILE}}' +] +include_patterns = [ + '**', +] +exclude_patterns = [ + # We don't care too much about files in this directory, don't enforce + # spelling on them + 'caffe2/**', + 'fb/**', + '**/fb/**', + 'third_party/**', + 'test/dynamo/cpython/**', + 'torch/_vendor/**', + 'torch/_inductor/fx_passes/serialized_patterns/**', + 'torch/_inductor/autoheuristic/artifacts/**', + # These files are all grandfathered in, 
feel free to remove from this list + # as necessary + # NOTE: remove the patterns in the order they are listed + 'aten/**', + 'aten/src/ATen/native/**', + 'aten/src/ATen/native/q*/**', + 'aten/src/ATen/native/[a-pA-P]*/**', + 'aten/src/ATen/[a-mA-M]*/**', + 'test/**', + 'test/test_*', + 'test/[a-hA-h]*/**', + 'test/inductor/**', + 'test/dynamo/**', + 'test/distributed/**', + 'torch/**', + 'torch/_*/**', + 'torch/ao/**', + 'torch/fx/**', + 'torch/distributed/tensor/**', + 'torch/[j-o]*/**', + 'torch/utils/**', + 'torch/csrc/jit/**', + 'torch/csrc/jit/[a-o]*/**', +] +init_command = [ + 'python3', + 'tools/linter/adapters/pip_init.py', + '--dry-run={{DRYRUN}}', + 'codespell[toml]==2.4.1', +] +is_formatter = true + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # usort + ruff-format [[linter]] code = 'PYFMT' @@ -1128,6 +1279,10 @@ exclude_patterns = [ 'caffe2/**/*.pyi', 'fb/**', '**/fb/**', +<<<<<<< HEAD +======= + 'test/dynamo/cpython/**', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'third_party/**/*.py', 'third_party/**/*.pyi', 'torch/_vendor/**', @@ -1135,12 +1290,15 @@ exclude_patterns = [ 'torch/_inductor/autoheuristic/artifacts/**', # These files are all grandfathered in, feel free to remove from this list # as necessary +<<<<<<< HEAD 'test/_nvfuser/__init__.py', 'test/_nvfuser/test_dynamo.py', 'test/_nvfuser/test_python_frontend.py', 'test/_nvfuser/test_torchscript.py', 'test/delete.py', 'test/expect/__init__.py', +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'test/quantization/__init__.py', 'test/quantization/core/__init__.py', 'test/quantization/core/experimental/apot_fx_graph_mode_ptq.py', @@ -1162,6 +1320,7 @@ exclude_patterns = [ 'test/quantization/core/test_utils.py', 'test/quantization/core/test_workflow_module.py', 'test/quantization/core/test_workflow_ops.py', +<<<<<<< HEAD 'test/quantization/eager/__init__.py', 'test/quantization/eager/test_bias_correction_eager.py', 'test/quantization/eager/test_equalize_eager.py', @@ -1170,14 +1329,19 @@ exclude_patterns = [ 'test/quantization/eager/test_numeric_suite_eager.py', 'test/quantization/eager/test_quantize_eager_ptq.py', 'test/quantization/eager/test_quantize_eager_qat.py', +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'test/quantization/fx/__init__.py', 'test/quantization/fx/test_equalize_fx.py', 'test/quantization/fx/test_model_report_fx.py', 'test/quantization/fx/test_numeric_suite_fx.py', 'test/quantization/fx/test_quantize_fx.py', 'test/quantization/fx/test_subgraph_rewriter.py', +<<<<<<< HEAD 'test/test_fake_tensor.py', 'test/test_flop_counter.py', +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'test/test_function_schema.py', 'test/test_functional_autograd_benchmark.py', 'test/test_functional_optim.py', @@ -1255,10 +1419,13 @@ exclude_patterns = [ 'test/test_unary_ufuncs.py', 'test/test_vulkan.py', 'torch/_awaits/__init__.py', +<<<<<<< HEAD 'torch/_custom_op/__init__.py', 'torch/_custom_op/autograd.py', 'torch/_custom_op/functional.py', 'torch/_custom_op/impl.py', +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 
'torch/_export/__init__.py', 'torch/_export/constraints.py', 'torch/_export/db/__init__.py', @@ -1296,6 +1463,7 @@ exclude_patterns = [ 'torch/_export/db/examples/type_reflection_method.py', 'torch/_export/db/gen_example.py', 'torch/_export/db/logging.py', +<<<<<<< HEAD 'torch/_export/error.py', 'torch/_export/exported_program.py', 'torch/_export/pass_base.py', @@ -1314,6 +1482,8 @@ exclude_patterns = [ 'torch/_export/serde/upgrade.py', 'torch/_export/trace.py', 'torch/_export/verifier.py', +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'torch/testing/_internal/__init__.py', 'torch/testing/_internal/autocast_test_lists.py', 'torch/testing/_internal/autograd_function_db.py', @@ -1321,7 +1491,10 @@ exclude_patterns = [ 'torch/testing/_internal/codegen/__init__.py', 'torch/testing/_internal/codegen/random_topo_test.py', 'torch/testing/_internal/common_cuda.py', +<<<<<<< HEAD 'torch/testing/_internal/common_distributed.py', +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'torch/testing/_internal/common_jit.py', 'torch/testing/_internal/common_methods_invocations.py', 'torch/testing/_internal/common_modules.py', @@ -1338,6 +1511,7 @@ exclude_patterns = [ 'torch/testing/_internal/data/network1.py', 'torch/testing/_internal/data/network2.py', 'torch/testing/_internal/dist_utils.py', +<<<<<<< HEAD 'torch/testing/_internal/distributed/__init__.py', 'torch/testing/_internal/distributed/_shard/__init__.py', 'torch/testing/_internal/distributed/_shard/sharded_tensor/__init__.py', @@ -1370,6 +1544,8 @@ exclude_patterns = [ 'torch/testing/_internal/distributed/rpc/rpc_test.py', 'torch/testing/_internal/distributed/rpc/tensorpipe_rpc_agent_test_fixture.py', 'torch/testing/_internal/distributed/rpc_utils.py', +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'torch/testing/_internal/generated/__init__.py', 'torch/testing/_internal/hypothesis_utils.py', 'torch/testing/_internal/inductor_utils.py', @@ -1386,6 +1562,7 @@ exclude_patterns = [ 'torch/testing/_internal/test_module/__init__.py', 'torch/testing/_internal/test_module/future_div.py', 'torch/testing/_internal/test_module/no_future_div.py', +<<<<<<< HEAD 'torch/utils/_contextlib.py', 'torch/utils/_cpp_extension_versioner.py', 'torch/utils/_crash_handler.py', @@ -1405,6 +1582,11 @@ exclude_patterns = [ 'torch/utils/benchmark/examples/blas_compare_setup.py', 'torch/utils/benchmark/examples/compare.py', 'torch/utils/benchmark/examples/end_to_end.py', +======= + 'torch/utils/benchmark/__init__.py', + 'torch/utils/benchmark/examples/__init__.py', + 'torch/utils/benchmark/examples/compare.py', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'torch/utils/benchmark/examples/fuzzer.py', 'torch/utils/benchmark/examples/op_benchmark.py', 'torch/utils/benchmark/examples/simple_timeit.py', @@ -1466,7 +1648,10 @@ exclude_patterns = [ 'torch/utils/throughput_benchmark.py', 'torch/utils/viz/__init__.py', 'torch/utils/viz/_cycles.py', +<<<<<<< HEAD 'torch/utils/weak.py', +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] init_command = [ 'python3', @@ -1475,8 +1660,13 @@ init_command = [ '--no-black-binary', 'black==23.12.1', 
'usort==1.0.8.post1', +<<<<<<< HEAD 'isort==5.13.2', 'ruff==0.9.8', # sync with RUFF +======= + 'isort==6.0.1', + 'ruff==0.11.13', # sync with RUFF +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] is_formatter = true @@ -1537,13 +1727,26 @@ command = [ [[linter]] code = 'RUFF' +<<<<<<< HEAD include_patterns = ['**/*.py', '**/*.pyi'] +======= +include_patterns = [ + '**/*.py', + '**/*.pyi', + '**/*.ipynb', + 'pyproject.toml', +] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) exclude_patterns = [ 'caffe2/**', 'functorch/docs/**', 'functorch/notebooks/**', 'torch/_inductor/fx_passes/serialized_patterns/**', 'torch/_inductor/autoheuristic/artifacts/**', +<<<<<<< HEAD +======= + 'test/dynamo/cpython/**', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'scripts/**', 'third_party/**', 'fb/**', @@ -1561,11 +1764,19 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', +<<<<<<< HEAD 'ruff==0.9.8', # sync with PYFMT ] is_formatter = true # This linter prevents merge conlicts in csv files in pytorch by enforcing +======= + 'ruff==0.11.13', # sync with PYFMT +] +is_formatter = true + +# This linter prevents merge conflicts in csv files in pytorch by enforcing +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # three lines of whitespace between entries such that unless people are modifying # the same line, merge conflicts should not arise in git or hg [[linter]] @@ -1720,7 +1931,11 @@ command = [ '@{{PATHSFILE}}' ] include_patterns = [ +<<<<<<< HEAD 'torch/**/not-exist.py' +======= + 'torch/_inductor/**/*.py' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] is_formatter = false @@ -1737,3 +1952,30 @@ include_patterns = [ 'torch/_dynamo/**', ] is_formatter = false +<<<<<<< HEAD +======= + +[[linter]] +code = 'TEST_DEVICE_BIAS' +command = [ + 'python3', + 'tools/linter/adapters/test_device_bias_linter.py', + '--', + '@{{PATHSFILE}}', +] +include_patterns = [ + 'test/**/test_*.py', +] + +# 'header_only_linter' reports on properly testing header-only APIs. 
+[[linter]] +code = 'HEADER_ONLY_LINTER' +command = [ + 'python3', + 'tools/linter/adapters/header_only_linter.py', +] +include_patterns = [ + 'torch/header_only_apis.txt', +] +is_formatter = false +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.vscode/extensions.json b/.vscode/extensions.json index 9b22ad8d65e5..b8207d4cd459 100644 --- a/.vscode/extensions.json +++ b/.vscode/extensions.json @@ -1,5 +1,17 @@ { +<<<<<<< HEAD "recommendations": [ "ms-python.python", ] +======= + "recommendations": [ + "ms-python.python", + "charliermarsh.ruff", + "ms-python.flake8", + "ms-python.mypy-type-checker", + "ms-vscode.cmake-tools", + "EditorConfig.EditorConfig", + "streetsidesoftware.code-spell-checker", + ] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } diff --git a/.vscode/settings_recommended.json b/.vscode/settings_recommended.json index 551a3ec2a5a3..30a059b15ec3 100644 --- a/.vscode/settings_recommended.json +++ b/.vscode/settings_recommended.json @@ -1,15 +1,67 @@ { +<<<<<<< HEAD "[python]": { "editor.tabSize": 4 }, "files.associations": { "*.py.in": "python", "*.pyi.in": "python" +======= + "files.associations": { + ".clang-format": "yaml", + ".clang-tidy": "yaml", + ".flake8": "ini", + ".coveragerc": "ini", + "*.py.in": "python", + "*.pyi.in": "python", + "*requirements*.txt": "pip-requirements", + "*requirements*.in": "pip-requirements", + "*.cpp.in": "cpp", + "*.h.in": "cpp", + "*.cmake.in": "cmake", + "Makefile.*": "makefile", + "*.Makefile": "makefile", + "BUCK": "starlark", + "BUCK.*": "starlark" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }, "files.eol": "\n", "files.insertFinalNewline": true, "files.trimFinalNewlines": true, "files.trimTrailingWhitespace": true, +<<<<<<< HEAD "python.linting.enabled": true, "python.linting.flake8Enabled": true +======= + "cmake.preferredGenerators": [ + "Ninja", + "Unix Makefiles" + ], + "cmake.configureEnvironment": { + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON" + }, + "cmake.sourceDirectory": "${workspaceFolder}", + "cmake.buildDirectory": "${workspaceFolder}/build", + "cmake.configureArgs": [ + "-DPython_EXECUTABLE=${workspaceFolder}/venv/bin/python", + "-DPython_ROOT_DIR=${workspaceFolder}/venv" + ], + "[python]": { + "editor.tabSize": 4, + "editor.defaultFormatter": "charliermarsh.ruff" + }, + "python.defaultInterpreterPath": "${workspaceFolder}/venv/bin/python", + "python.analysis.inlayHints.functionReturnTypes": true, + "flake8.importStrategy": "fromEnvironment", + "flake8.args": [ + "--append-config=${workspaceFolder}/.flake8" + ], + "ruff.importStrategy": "fromEnvironment", + "ruff.lineLength": 88, + "ruff.organizeImports": false, + "ruff.configurationPreference": "filesystemFirst", + "mypy-type-checker.importStrategy": "fromEnvironment", + "mypy-type-checker.preferDaemon": true, + "mypy-type-checker.reportingScope": "workspace" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 000000000000..daf0f491702b --- /dev/null +++ b/AGENTS.md @@ -0,0 +1 @@ +- This is the only AGENTS.md, there are no recursive AGENTS.md diff --git a/BUILD.bazel b/BUILD.bazel index e848f441541d..9495e554b77b 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -1,4 +1,8 @@ 
load("@bazel_skylib//lib:paths.bzl", "paths") +<<<<<<< HEAD +======= +load("@com_github_google_flatbuffers//:build_defs.bzl", "flatbuffer_cc_library") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) load("@pybind11_bazel//:build_defs.bzl", "pybind_extension") load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_test") load("@rules_python//python:defs.bzl", "py_library", "py_test") @@ -289,6 +293,10 @@ header_template_rule( substitutions = { "@AT_CUDNN_ENABLED@": "1", "@AT_CUSPARSELT_ENABLED@": "0", +<<<<<<< HEAD +======= + "@AT_HIPSPARSELT_ENABLED@": "0", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "@AT_ROCM_ENABLED@": "0", "@AT_MAGMA_ENABLED@": "0", "@NVCC_FLAGS_EXTRA@": "", @@ -375,6 +383,10 @@ cc_library( ":torch_headers", "@fbgemm", "@ideep", +<<<<<<< HEAD +======= + "@nlohmann", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ], alwayslink = True, ) @@ -497,7 +509,11 @@ filegroup( # To achieve finer granularity and make debug easier, caffe2 is split into three libraries: # ATen, caffe2 and caffe2_for_aten_headers. ATen lib group up source codes under # aten/ directory and caffe2 contains most files under `caffe2/` directory. Since the +<<<<<<< HEAD # ATen lib and the caffe2 lib would depend on each other, `caffe2_for_aten_headers` is splitted +======= +# ATen lib and the caffe2 lib would depend on each other, `caffe2_for_aten_headers` is split +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # out from `caffe2` to avoid dependency cycle. 
cc_library( name = "caffe2_for_aten_headers", @@ -579,9 +595,15 @@ cc_library( cu_library( name = "torch_cuda", srcs = [ +<<<<<<< HEAD "torch/csrc/distributed/c10d/intra_node_comm.cu", "torch/csrc/distributed/c10d/NanCheck.cu", "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu", +======= + "torch/csrc/distributed/c10d/NanCheck.cu", + "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu", + "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ], copts = torch_cuda_half_options, visibility = ["//visibility:public"], @@ -659,6 +681,18 @@ cc_library( # torch torch_cuda_headers = glob(["torch/csrc/cuda/*.h"]) +<<<<<<< HEAD +======= +flatbuffer_cc_library( + name = "torch_flatbuffers", + srcs = [ + "torch/csrc/jit/serialization/mobile_bytecode.fbs", + ], + flatc_args = ["--cpp", "--gen-mutable", "--scoped-enums"], + out_prefix = "torch/csrc/jit/serialization/", +) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cc_library( name = "torch_headers", hdrs = if_cuda( @@ -672,6 +706,10 @@ cc_library( ], exclude = [ "torch/csrc/*/generated/*.h", +<<<<<<< HEAD +======= + "torch/csrc/jit/serialization/mobile_bytecode_generated.h", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] + torch_cuda_headers, ) + GENERATED_AUTOGRAD_CPP + [":version_h"], includes = [ @@ -686,6 +724,10 @@ cc_library( deps = [ ":aten_headers", ":caffe2_headers", +<<<<<<< HEAD +======= + ":torch_flatbuffers", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "//c10", "@com_github_google_flatbuffers//:flatbuffers", "@local_config_python//:python_headers", @@ -723,6 +765,7 @@ cc_library( srcs = if_cuda(glob( libtorch_cuda_sources, exclude = [ +<<<<<<< HEAD "torch/csrc/cuda/python_nccl.cpp", "torch/csrc/cuda/nccl.cpp", "torch/csrc/distributed/c10d/intra_node_comm.cu", @@ -731,6 +774,17 @@ cc_library( "torch/csrc/distributed/c10d/cuda/AsyncMM.cu", "torch/csrc/distributed/c10d/NanCheck.cu", "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu", +======= + "torch/csrc/cuda/nccl.cpp", + "torch/csrc/cuda/python_nccl.cpp", + "torch/csrc/distributed/c10d/NanCheck.cu", + "torch/csrc/distributed/c10d/cuda/AsyncMM.cu", + "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu", + "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu", + "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu", + "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp", + "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ], )) + torch_sources, copts = TORCH_COPTS, diff --git a/CITATION.cff b/CITATION.cff index e6de8772cbf2..02e112a96d12 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -113,4 +113,8 @@ preferred-citation: publisher: name: ACM doi: "10.1145/3620665.3640366" +<<<<<<< HEAD url: "https://pytorch.org/assets/pytorch2-2.pdf" +======= + url: "https://docs.pytorch.org/assets/pytorch2-2.pdf" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8f0dcbff73cd..e273b5cb4f5c 100644 
--- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,8 @@ +<<<<<<< HEAD cmake_minimum_required(VERSION 3.18 FATAL_ERROR) +======= +cmake_minimum_required(VERSION 3.27 FATAL_ERROR) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0023 NEW) # Use compiler ID "AppleClang" instead of "Clang" for XCode. Not setting this @@ -6,6 +10,10 @@ cmake_minimum_required(VERSION 3.18 FATAL_ERROR) # one is detected as "AppleClang". cmake_policy(SET CMP0010 NEW) cmake_policy(SET CMP0025 NEW) +<<<<<<< HEAD +======= +cmake_policy(SET CMP0126 OLD) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Enables CMake to set LTO on compilers other than Intel. cmake_policy(SET CMP0069 NEW) @@ -16,6 +24,11 @@ cmake_policy(SET CMP0069 NEW) # we do this (and we don't if cmake is old), but it's nice when it's possible, # and it's possible on our Windows configs. cmake_policy(SET CMP0092 NEW) +<<<<<<< HEAD +======= +# Don't remove the FindCUDA module +cmake_policy(SET CMP0146 OLD) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Prohibit in-source builds if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR}) @@ -54,21 +67,34 @@ set(CMAKE_C_STANDARD # ---[ Utils include(cmake/public/utils.cmake) +<<<<<<< HEAD # --- [ Check that minimal gcc version is 9.2+ if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.2) message( FATAL_ERROR "GCC-9.2 or newer is required to compile PyTorch, but found ${CMAKE_CXX_COMPILER_VERSION}" +======= +# --- [ Check that minimal gcc version is 9.3+ +if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.3) + message( + FATAL_ERROR + "GCC-9.3 or newer is required to compile PyTorch, but found ${CMAKE_CXX_COMPILER_VERSION}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) endif() # This define is needed to preserve behavior given anticpated changes to # cccl/thrust +<<<<<<< HEAD # https://nvidia.github.io/libcudacxx/standard_api/numerics_library/complex.html +======= +# https://nvidia.github.io/cccl/libcudacxx/standard_api/numerics_library/complex.html +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) string(APPEND CMAKE_CUDA_FLAGS " -DLIBCUDACXX_ENABLE_SIMPLIFIED_COMPLEX_OPERATIONS") if(LINUX) +<<<<<<< HEAD include(cmake/CheckAbi.cmake) string(APPEND CMAKE_CXX_FLAGS " -D_GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI}") @@ -82,6 +108,9 @@ if(LINUX) # compiled by the same toolchain again append_cxx_flag_if_supported("-fabi-version=11" CMAKE_CXX_FLAGS) endif() +======= + set(CXX_STANDARD_REQUIRED ON) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) endif() set(CMAKE_EXPORT_COMPILE_COMMANDS ON) @@ -271,6 +300,11 @@ option(USE_NATIVE_ARCH "Use -march=native" OFF) cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF) cmake_dependent_option(USE_NCCL "Use NCCL" ON "USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) +<<<<<<< HEAD +======= +cmake_dependent_option(USE_XCCL "Use XCCL" ON + "USE_XPU;UNIX;NOT APPLE" OFF) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 
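As an aside on the `USE_XCCL` CMake option introduced in the hunk above: a minimal sketch of toggling it at build time, assuming the usual PyTorch pattern where `USE_*` environment variables are forwarded to CMake by `setup.py` (the flag name is taken from the diff; whether your build actually has XPU support is a separate question):

```bash
# Sketch only: disable the XCCL backend for one build, assuming setup.py
# forwards USE_* environment variables to CMake as it does for USE_CUDA.
USE_XCCL=0 python setup.py develop

# Or, for a standalone CMake configure, set the option explicitly.
cmake -S . -B build -DUSE_XCCL=ON
```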
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF) cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF) cmake_dependent_option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF "USE_NCCL" @@ -344,10 +378,21 @@ cmake_dependent_option( USE_GLOO_WITH_OPENSSL "Use Gloo with OpenSSL. Only available if USE_GLOO is on." OFF "USE_GLOO AND LINUX AND NOT INTERN_BUILD_MOBILE" OFF) cmake_dependent_option( +<<<<<<< HEAD +======= + USE_GLOO_IBVERBS "Use Gloo with ibverbs backend. Only available if USE_GLOO is on." OFF + "USE_GLOO AND LINUX AND NOT INTERN_BUILD_MOBILE" OFF) +cmake_dependent_option( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) USE_C10D_GLOO "USE C10D GLOO" ON "USE_DISTRIBUTED;USE_GLOO" OFF) cmake_dependent_option( USE_C10D_NCCL "USE C10D NCCL" ON "USE_DISTRIBUTED;USE_NCCL" OFF) cmake_dependent_option( +<<<<<<< HEAD +======= + USE_C10D_XCCL "USE C10D XCCL" ON "USE_DISTRIBUTED;USE_XCCL" OFF) +cmake_dependent_option( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) USE_C10D_MPI "USE C10D MPI" ON "USE_DISTRIBUTED;USE_MPI" OFF) cmake_dependent_option( USE_TENSORPIPE "Use TensorPipe. Only available if USE_DISTRIBUTED is on." ON @@ -531,7 +576,10 @@ if(USE_LIGHTWEIGHT_DISPATCH AND NOT STATIC_DISPATCH_BACKEND) endif() option(TRACING_BASED "Master flag to build Lite Interpreter with tracing build option" OFF) +<<<<<<< HEAD option(BUILD_EXECUTORCH "Master flag to build Executorch" ON) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # This is a fix for a rare build issue on Ubuntu: symbol lookup error: # miniconda3/envs/pytorch-py3.7/lib/libmkl_intel_lp64.so: undefined symbol: # mkl_blas_dsyrk @@ -563,6 +611,13 @@ if(MSVC) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /Zc:__cplusplus") set(CMAKE_NINJA_CMCLDEPS_RC OFF) +<<<<<<< HEAD +======= + if(MSVC_Z7_OVERRIDE) + # CMake set debug flags to use /Z7 + set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT Embedded) + endif() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) foreach( flag_var CMAKE_C_FLAGS @@ -575,12 +630,15 @@ if(MSVC) CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) +<<<<<<< HEAD # Replace /Zi and /ZI with /Z7 if(MSVC_Z7_OVERRIDE) if(${flag_var} MATCHES "/Z[iI]") string(REGEX REPLACE "/Z[iI]" "/Z7" ${flag_var} "${${flag_var}}") endif(${flag_var} MATCHES "/Z[iI]") endif(MSVC_Z7_OVERRIDE) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if(${CAFFE2_USE_MSVC_STATIC_RUNTIME}) if(${flag_var} MATCHES "/MD") @@ -703,7 +761,11 @@ endif() if(USE_KLEIDIAI AND CMAKE_C_COMPILER_VERSION) if(CMAKE_C_COMPILER_VERSION VERSION_LESS 11) set(USE_KLEIDIAI OFF) +<<<<<<< HEAD message(WARNING "Disabling KleidiAI: Requires atleast GCC 11 or Clang 11") +======= + message(WARNING "Disabling KleidiAI: Requires at least GCC 11 or Clang 11") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) endif() endif() @@ -872,7 +934,11 @@ cmake_dependent_option( "Whether to build the flash_attention kernel for scaled dot product attention.\ Will be disabled if not supported by the platform" ON +<<<<<<< HEAD "USE_CUDA OR USE_ROCM;NOT MSVC" 
+======= + "(USE_CUDA AND NOT MSVC) OR USE_ROCM" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) OFF) # CAVEAT: Again, Flash Attention2 will error while building for sm52 while Mem @@ -888,7 +954,11 @@ cmake_dependent_option( # USE_FLASH_ATTENTION -> USE_ROCM -> Dependencies.cmake -> aotriton.cmake # if(USE_ROCM) +<<<<<<< HEAD if(UNIX AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION)) +======= + if(USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) include(cmake/External/aotriton.cmake) endif() endif() @@ -988,8 +1058,22 @@ endif() # ---[ Build flags Re-include to override append_cxx_flag_if_supported from # third_party/FBGEMM include(cmake/public/utils.cmake) +<<<<<<< HEAD if(NOT MSVC) string(APPEND CMAKE_CXX_FLAGS " -O2 -fPIC") +======= +if(USE_COLORIZE_OUTPUT) + set(CMAKE_COLOR_DIAGNOSTICS ON) +endif() +if(NOT MSVC) + string(APPEND CMAKE_CXX_FLAGS " -O2 -fPIC") + + # This prevents use of `c10::optional`, `c10::nullopt` etc within the codebase + string(APPEND CMAKE_CXX_FLAGS " -DC10_NODEPRECATED") + string(APPEND CMAKE_CUDA_FLAGS " -DC10_NODEPRECATED") + string(APPEND CMAKE_OBJCXX_FLAGS " -DC10_NODEPRECATED") + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Eigen fails to build with some versions, so convert this to a warning # Details at http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1459 string(APPEND CMAKE_CXX_FLAGS " -Wall") @@ -1057,6 +1141,7 @@ if(NOT MSVC) CMAKE_CXX_FLAGS) append_cxx_flag_if_supported("-Qunused-arguments" CMAKE_CXX_FLAGS) +<<<<<<< HEAD if(${USE_COLORIZE_OUTPUT}) # Why compiler checks are necessary even when `try_compile` is used Because # of the bug in ccache that can incorrectly identify `-fcolor-diagnostics` @@ -1070,6 +1155,8 @@ if(NOT MSVC) endif() endif() +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) append_cxx_flag_if_supported("-faligned-new" CMAKE_CXX_FLAGS) if(WERROR) @@ -1096,7 +1183,10 @@ if(NOT MSVC) if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13) append_cxx_flag_if_supported("-Wno-dangling-reference" CMAKE_CXX_FLAGS) append_cxx_flag_if_supported("-Wno-error=dangling-reference" CMAKE_CXX_FLAGS) +<<<<<<< HEAD append_cxx_flag_if_supported("-Wno-error=redundant-move" CMAKE_CXX_FLAGS) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) endif() else() # Define export functions for AOTI. @@ -1256,7 +1346,11 @@ endif() add_subdirectory(c10) add_subdirectory(caffe2) +<<<<<<< HEAD # ---[ CMake related files Uninistall option. +======= +# ---[ CMake related files Uninstall option. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if(NOT TARGET caffe2_uninstall) configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in diff --git a/CODEOWNERS b/CODEOWNERS index ed5edc0abbb4..3b08a825a6b6 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -7,13 +7,21 @@ # Each line is a file pattern followed by one or more owners. # For module labels => owners mapping, please see https://github.com/pytorch/pytorch/issues/24422. 
+<<<<<<< HEAD /torch/utils/cpp_extension.py @fmassa @soumith @ezyang +======= +/torch/utils/cpp_extension.py @fmassa @ezyang @malfet +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Not there to strictly require the approval, but to be tagged as a reviewer # on the PRs to push them into a high priority inbox. /torch/csrc/autograd/ @albanD @soulitzer /torch/autograd/ @albanD @soulitzer /tools/autograd/ @albanD @soulitzer +<<<<<<< HEAD +======= +/torch/header_only_apis.txt @janeyx99 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /torch/nn/ @albanD @jbschlosser @mikaylagawarecki /torch/optim/ @albanD @janeyx99 /test/test_public_bindings.py @albanD @@ -21,6 +29,10 @@ /test/forward_backward_compatibility/check_forward_backward_compatibility.py @larryliu0820 /docs/source/conf.py @albanD /aten/src/ATen/native/tags.yaml @ezyang +<<<<<<< HEAD +======= +/.github/merge_rules.yaml @albanD @malfet +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Architecture Optimization (quantization, sparsity, etc.) /aten/src/ATen/native/ao_sparse @salilsdesai @kimishpatel @digantdesai @jianyuh @@ -49,12 +61,21 @@ nn/qat/ @jerryzh168 /torch/csrc/distributed/c10d/Ops.* @kwen2501 # ONNX Export +<<<<<<< HEAD /torch/_dynamo/backends/onnxrt.py @wschin @xadupre /torch/csrc/jit/passes/onnx.h @titaiwangms @shubhambhokare1 @xadupre /torch/csrc/jit/passes/onnx.cpp @titaiwangms @shubhambhokare1 @xadupre /torch/csrc/jit/passes/onnx/ @titaiwangms @shubhambhokare1 @xadupre /torch/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin @xadupre /test/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin @xadupre +======= +/torch/_dynamo/backends/onnxrt.py @wschin +/torch/csrc/jit/passes/onnx.h @titaiwangms @shubhambhokare1 +/torch/csrc/jit/passes/onnx.cpp @titaiwangms @shubhambhokare1 +/torch/csrc/jit/passes/onnx/ @titaiwangms @shubhambhokare1 +/torch/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin +/test/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # CI /.ci @pytorch/pytorch-dev-infra @@ -134,7 +155,11 @@ torch/profiler/ @sraikund16 test/functorch/test_aotdispatch.py @ezyang @Chillee # Dataloader +<<<<<<< HEAD torch/utils/data/ @andrewkho @divyanshk +======= +torch/utils/data/ @divyanshk @ramanishsingh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # hipify torch/utils/hipify/ @jeffdaily @jithunnair-amd @@ -164,6 +189,14 @@ caffe2/utils/hip @jeffdaily @jithunnair-amd /torch/export/ @avikchaudhuri @tugsbayasgalan @zhxchen17 @ydwu4 @angelayi /torch/_export/ @avikchaudhuri @tugsbayasgalan @zhxchen17 @ydwu4 @angelayi +<<<<<<< HEAD +======= +# Dynamic Shapes +/torch/fx/experimental/symbolic_shapes.py @bobrenjc93 @laithsakka +/torch/fx/experimental/sym_node.py @bobrenjc93 @laithsakka +/torch/fx/experimental/recording.py @bobrenjc93 @laithsakka + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # serialization-related files /aten/src/ATen/MapAllocator* @mikaylagawarecki /caffe2/serialize/ @mikaylagawarecki diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e48eee1889eb..3c591eb9c8f0 100644 --- 
a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -112,8 +112,12 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows lazy.) ```bash +<<<<<<< HEAD conda uninstall pytorch -y yes | pip uninstall torch +======= + pip uninstall torch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` Next run `python setup.py clean`. After that, you can install in `develop` mode again. @@ -130,8 +134,13 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows git submodule deinit -f . git clean -xdf python setup.py clean +<<<<<<< HEAD git submodule update --init --recursive # very important to sync the submodules python setup.py develop # then try running the command again +======= + git submodule update --init --recursive + python setup.py develop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` 4. The main step within `python setup.py develop` is running `make` from the `build` directory. If you want to experiment with some environment variables, you can pass them into the command: @@ -149,7 +158,11 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows - If you encounter an error such as ``` +<<<<<<< HEAD fatal: unable to access 'https://github.com/pybind11/pybind11.git': could not load PEM client certificate ... +======= + fatal: unable to access 'https://github.com/pybind/pybind11.git': could not load PEM client certificate ... +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` this is likely that you are using HTTP proxying and the certificate expired. To check if the certificate is valid, run `git config --global --list` and search for config like `http.proxysslcert=`. Then check certificate valid date by running @@ -180,6 +193,7 @@ You can use this script to check out a new nightly branch with the following: source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows ``` +<<<<<<< HEAD Or if you would like to re-use an existing conda environment, you can pass in the prefix argument (`--prefix`): @@ -188,6 +202,8 @@ the prefix argument (`--prefix`): source my-env/bin/activate # or `& .\my-env\Scripts\Activate.ps1` on Windows ``` +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) To install the nightly binaries built with CUDA, you can pass in the flag `--cuda`: ```bash @@ -237,6 +253,11 @@ dependencies as well as the nightly binaries into the repo directory. details. * [cuda](aten/src/ATen/native/cuda) - CUDA implementations of operators. +<<<<<<< HEAD +======= + * [mps](aten/src/ATen/native/mps) - MPS implementations of + operators for Apple's Metal GPU family. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * [sparse](aten/src/ATen/native/sparse) - CPU and CUDA implementations of COO sparse tensor operations * [mkl](aten/src/ATen/native/mkl) [mkldnn](aten/src/ATen/native/mkldnn) @@ -281,8 +302,11 @@ dependencies as well as the nightly binaries into the repo directory. * [caffe2](caffe2) - The Caffe2 library. * [core](caffe2/core) - Core files of Caffe2, e.g., tensor, workspace, blobs, etc. +<<<<<<< HEAD * [operators](caffe2/operators) - Operators of Caffe2. * [python](caffe2/python) - Python bindings to Caffe2. 
+======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * ... * [.circleci](.circleci) - CircleCI configuration management. [README](.circleci/README.md) @@ -291,7 +315,11 @@ dependencies as well as the nightly binaries into the repo directory. ### Python Unit Testing **Prerequisites**: +<<<<<<< HEAD The following packages should be installed with either `conda` or `pip`: +======= +The following packages should be installed with `pip`: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - `expecttest` and `hypothesis` - required to run tests - `mypy` - recommended for linting - `pytest` - recommended to run tests more selectively @@ -354,6 +382,7 @@ command runs tests such as `TestNN.test_BCELoss` and ### Local linting +<<<<<<< HEAD Install all prerequisites by running ```bash @@ -361,6 +390,9 @@ make setup-lint ``` You can now run the same linting steps that are used in CI locally via `make`: +======= +You can run the same linting steps that are used in CI locally via `make`: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ```bash make lint @@ -436,7 +468,11 @@ PyTorch has two main types of documentation: These are the docs that you see over at [our docs website](https://pytorch.org/docs). - **Developer facing documentation**: Developer facing documentation is spread around our READMEs in our codebase and in +<<<<<<< HEAD the [PyTorch Developer Wiki](https://pytorch.org/wiki). +======= +the [PyTorch Developer Wiki](https://github.com/pytorch/pytorch/wiki). +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) If you're interested in adding new developer docs, please read this [page on the wiki](https://github.com/pytorch/pytorch/wiki/Where-or-how-should-I-add-documentation) on our best practices for where to put it. The rest of this section is about user-facing documentation. @@ -484,7 +520,11 @@ In addition to the standard Google Style docstring formatting rules, the followi ### Building documentation +<<<<<<< HEAD To build the documentation: +======= +Note that the docs will only build with Python versions <3.13. To build the documentation: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 1. Build and install PyTorch @@ -499,8 +539,12 @@ pip install -r requirements.txt # Or if you prefer an uncontaminated global executable environment or do not want to go through the node configuration: # npm install katex && export PATH="$PATH:$(pwd)/node_modules/.bin" ``` +<<<<<<< HEAD > Note: if you installed `nodejs` with a different package manager (e.g., `conda`) then `npm` will probably install a version of `katex` that is not +======= +> Note: if you installed `nodejs` with a different package manager then `npm` will probably install a version of `katex` that is not +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) compatible with your version of `nodejs` and doc builds will fail. A combination of versions that is known to work is `node@6.13.1` and `katex@0.13.18`. 
To install the latter with `npm` you can run @@ -595,9 +639,14 @@ rsync -az me@my_machine:/path/to/pytorch/docs/cpp/build/html cpp/build ### Previewing documentation on PRs +<<<<<<< HEAD PyTorch will host documentation previews at `https://docs-preview.pytorch.org/pytorch/pytorch//index.html` once the `pytorch_python_doc_build` GitHub Actions job has completed on your PR. You can visit that page directly or find its link in the automated Dr. CI comment on your PR. +======= +PyTorch will host documentation previews at `https://docs-preview.pytorch.org/pytorch/pytorch//index.html` once the docs GitHub Actions job has completed on your PR. You can find its link in the automated pytorchbot comment on your PR or go to the URL +directly. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ### Adding documentation tests @@ -669,16 +718,28 @@ details. One downside to using `python setup.py develop` is that your development version of PyTorch will be installed globally on your account (e.g., if you run `import torch` anywhere else, the development version will be +<<<<<<< HEAD used. If you want to manage multiple builds of PyTorch, you can make use of [conda environments](https://conda.io/docs/using/envs.html) to maintain +======= +used). + +If you want to manage multiple builds of PyTorch, you can make use of +[venv environments](https://docs.python.org/3/library/venv.html) to maintain +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) separate Python package environments, each of which can be tied to a specific build of PyTorch. To set one up: ```bash +<<<<<<< HEAD conda create -n pytorch-myfeature source activate pytorch-myfeature +======= +python -m venv pytorch-myfeature +source pytorch-myfeature/bin/activate # or `& .\pytorch-myfeature\Scripts\Activate.ps1` on Windows +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # if you run python now, torch will NOT be installed python setup.py develop ``` @@ -756,7 +817,10 @@ same. Using ccache in a situation like this is a real time-saver. Before building pytorch, install ccache from your package manager of choice: ```bash +<<<<<<< HEAD conda install ccache -c conda-forge +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sudo apt install ccache sudo yum install ccache brew install ccache @@ -990,7 +1054,11 @@ If you are working on the CUDA code, here are some useful CUDA debugging tips: 3. CUDA supports a lot of C++11/14 features such as, `std::numeric_limits`, `std::nextafter`, `std::tuple` etc. in device code. Many of such features are possible because of the [--expt-relaxed-constexpr](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#constexpr-functions) +<<<<<<< HEAD nvcc flag. There is a known [issue](https://github.com/ROCm-Developer-Tools/HIP/issues/374) +======= + nvcc flag. There is a known [issue](https://github.com/ROCm/hip/issues/374) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) that ROCm errors out on device code, which uses such stl functions. 4. A good performance metric for a CUDA kernel is the [Effective Memory Bandwidth](https://devblogs.nvidia.com/how-implement-performance-metrics-cuda-cc/). 
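Relating to the ccache hunk above, a minimal sketch of checking that ccache is actually being hit during a PyTorch rebuild (the launcher variables are standard CMake settings and are optional if ccache is already detected; hit rates and cache layout will vary):

```bash
# Sketch: verify ccache is picking up compilations during a rebuild.
# Assumes ccache is already installed as described above.
ccache --zero-stats                  # reset hit/miss counters
# Optionally route compiler invocations through ccache explicitly.
export CMAKE_C_COMPILER_LAUNCHER=ccache
export CMAKE_CXX_COMPILER_LAUNCHER=ccache
export CMAKE_CUDA_COMPILER_LAUNCHER=ccache
python setup.py develop              # rebuild PyTorch
ccache --show-stats                  # non-zero hits indicate the cache is working
```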
@@ -1048,8 +1116,12 @@ than Linux, which are worth keeping in mind when fixing these problems. 3. If you have a Windows box (we have a few on EC2 which you can request access to) and you want to run the build, the easiest way is to just run `.ci/pytorch/win-build.sh`. +<<<<<<< HEAD If you need to rebuild, run `REBUILD=1 .ci/pytorch/win-build.sh` (this will avoid blowing away your Conda environment.) +======= + If you need to rebuild, run `REBUILD=1 .ci/pytorch/win-build.sh`. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Even if you don't know anything about MSVC, you can use cmake to build simple programs on Windows; this can be helpful if you want to learn more about some peculiar linking behavior @@ -1137,7 +1209,11 @@ CUDA, MSVC, and PyTorch versions are interdependent; please install matching ver | 10.2 | Visual Studio 2019 (16.X) (`_MSC_VER` < 1930) | 1.5.0 ~ 1.7.0 | | 11.0 | Visual Studio 2019 (16.X) (`_MSC_VER` < 1930) | 1.7.0 | +<<<<<<< HEAD Note: There's a [compilation issue](https://github.com/oneapi-src/oneDNN/issues/812) in several Visual Studio 2019 versions since 16.7.1, so please make sure your Visual Studio 2019 version is not in 16.7.1 ~ 16.7.5 +======= +Note: There's a [compilation issue](https://github.com/uxlfoundation/oneDNN/issues/812) in several Visual Studio 2019 versions since 16.7.1, so please make sure your Visual Studio 2019 version is not in 16.7.1 ~ 16.7.5 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ## Pre-commit tidy/linting hook @@ -1266,7 +1342,11 @@ in the meantime there will be some separation. There are a few "unusual" directories which, for historical reasons, are Caffe2/PyTorch specific. Here they are: +<<<<<<< HEAD - `CMakeLists.txt`, `Makefile`, `binaries`, `cmake`, `conda`, `modules`, +======= +- `CMakeLists.txt`, `Makefile`, `binaries`, `cmake`, `modules`, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) `scripts` are Caffe2-specific. Don't put PyTorch code in them without extra coordination. diff --git a/Dockerfile b/Dockerfile index 5cec2173063b..0c66b38c9f05 100644 --- a/Dockerfile +++ b/Dockerfile @@ -70,7 +70,11 @@ RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} ARG TARGETPLATFORM +<<<<<<< HEAD # INSTALL_CHANNEL whl - release, whl/nightly - nightly, whle/test - test channels +======= +# INSTALL_CHANNEL whl - release, whl/nightly - nightly, whl/test - test channels +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) RUN case ${TARGETPLATFORM} in \ "linux/arm64") pip install --extra-index-url https://download.pytorch.org/whl/cpu/ torch torchvision torchaudio ;; \ *) pip install --index-url https://download.pytorch.org/${INSTALL_CHANNEL}/${CUDA_PATH#.}/ torch torchvision torchaudio ;; \ diff --git a/Makefile b/Makefile index e5b4386b5dd2..d2c66d31b914 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,5 @@ # This makefile does nothing but delegating the actual building to cmake. +<<<<<<< HEAD PYTHON = python3 PIP = $(PYTHON) -m pip NIGHTLY_TOOL_OPTS := pull @@ -18,17 +19,58 @@ ios: clean: # This will remove ALL build folders. 
@rm -r build*/ +======= + +SHELL = /bin/bash +.SHELLFLAGS := -eu -o pipefail -c +PYTHON ?= $(shell command -v python3 || command -v python) +PIP = $(PYTHON) -m pip +NIGHTLY_TOOL_OPTS := pull + +.PHONY: all +all: + @cmake -S . -B build $(shell $(PYTHON) ./scripts/get_python_cmake_flags.py) && \ + cmake --build build --parallel -- + +.PHONY: local +local: + @./scripts/build_local.sh + +.PHONY: android +android: + @./scripts/build_android.sh + +.PHONY: ios +ios: + @./scripts/build_ios.sh + +.PHONY: triton +triton: + $(PIP) uninstall -y triton + @./scripts/install_triton_wheel.sh + +.PHONY: clean +clean: # This will remove ALL build folders. + @rm -r build*/ || true + +.PHONY: linecount +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) linecount: @cloc --read-lang-def=caffe.cloc caffe2 || \ echo "Cloc is not available on the machine. You can install cloc with " && \ echo " sudo apt-get install cloc" +<<<<<<< HEAD +======= +.PHONY: ensure-branch-clean +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ensure-branch-clean: @if [ -n "$(shell git status --porcelain)" ]; then \ echo "Please commit or stash all changes before running this script"; \ exit 1; \ fi +<<<<<<< HEAD setup-env: ensure-branch-clean $(PYTHON) tools/nightly.py $(NIGHTLY_TOOL_OPTS) @@ -57,3 +99,52 @@ quicklint: triton: $(PIP) uninstall -y triton @./scripts/install_triton_wheel.sh +======= +.PHONY: setup-env +setup-env: ensure-branch-clean + $(PYTHON) tools/nightly.py $(NIGHTLY_TOOL_OPTS) + +.PHONY: setup-env-cuda +setup-env-cuda: + $(MAKE) setup-env PYTHON="$(PYTHON)" NIGHTLY_TOOL_OPTS="$(NIGHTLY_TOOL_OPTS) --cuda" + +.PHONY: setup-env-rocm +setup-env-rocm: + $(MAKE) setup-env PYTHON="$(PYTHON)" NIGHTLY_TOOL_OPTS="$(NIGHTLY_TOOL_OPTS) --rocm" + +.lintbin/.lintrunner.sha256: requirements.txt pyproject.toml .lintrunner.toml + @echo "Setting up lintrunner..." + $(PIP) install lintrunner + lintrunner init + @echo "Generating .lintrunner.sha256..." + @mkdir -p .lintbin + @sha256sum requirements.txt pyproject.toml .lintrunner.toml > .lintbin/.lintrunner.sha256 + +.PHONY: setup-lint +setup-lint: .lintbin/.lintrunner.sha256 + +.PHONY: lazy-setup-lint +lazy-setup-lint: .lintbin/.lintrunner.sha256 + @if [ ! 
-x "$(shell command -v lintrunner)" ]; then \ + $(MAKE) setup-lint; \ + fi + +.PHONY: lint +lint: lazy-setup-lint + lintrunner --all-files + +.PHONY: quicklint +quicklint: lazy-setup-lint + lintrunner + +.PHONY: quickfix +quickfix: lazy-setup-lint + lintrunner --apply-patches + +# Deprecated target aliases +.PHONY: setup_env setup_env_cuda setup_env_rocm setup_lint +setup_env: setup-env +setup_env_cuda: setup-env-cuda +setup_env_rocm: setup-env-rocm +setup_lint: setup-lint +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/README.md b/README.md index eccd24e16cf4..f013dbe862a5 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,10 @@ Our trunk health (Continuous Integration signals) can be found at [hud.pytorch.o - [Using pre-built images](#using-pre-built-images) - [Building the image yourself](#building-the-image-yourself) - [Building the Documentation](#building-the-documentation) +<<<<<<< HEAD +======= + - [Building a PDF](#building-a-pdf) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - [Previous Versions](#previous-versions) - [Getting Started](#getting-started) - [Resources](#resources) @@ -169,8 +173,11 @@ Professional, or Community Editions. You can also install the build tools from https://visualstudio.microsoft.com/visual-cpp-build-tools/. The build tools *do not* come with Visual Studio Code by default. +<<<<<<< HEAD \* We highly recommend installing an [Anaconda](https://www.anaconda.com/download) environment. You will get a high-quality BLAS library (MKL) and you get controlled dependency versions regardless of your Linux distro. +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) An example of environment setup is shown below: * Linux: @@ -190,16 +197,33 @@ $ conda activate $ call "C:\Program Files\Microsoft Visual Studio\\Community\VC\Auxiliary\Build\vcvarsall.bat" x64 ``` +<<<<<<< HEAD +======= +A conda environment is not required. You can also do a PyTorch build in a +standard virtual environment, e.g., created with tools like `uv`, provided +your system has installed all the necessary dependencies unavailable as pip +packages (e.g., CUDA, MKL.) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ##### NVIDIA CUDA Support If you want to compile with CUDA support, [select a supported version of CUDA from our support matrix](https://pytorch.org/get-started/locally/), then install the following: - [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) - [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v8.5 or above - [Compiler](https://gist.github.com/ax3l/9489132) compatible with CUDA +<<<<<<< HEAD Note: You could refer to the [cuDNN Support Matrix](https://docs.nvidia.com/deeplearning/cudnn/reference/support-matrix.html) for cuDNN versions with the various supported CUDA, CUDA driver and NVIDIA hardware If you want to disable CUDA support, export the environment variable `USE_CUDA=0`. Other potentially useful environment variables may be found in `setup.py`. 
+======= +Note: You could refer to the [cuDNN Support Matrix](https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html) for cuDNN versions with the various supported CUDA, CUDA driver and NVIDIA hardware + +If you want to disable CUDA support, export the environment variable `USE_CUDA=0`. +Other potentially useful environment variables may be found in `setup.py`. If +CUDA is installed in a non-standard location, set PATH so that the nvcc you +want to use can be found (e.g., `export PATH=/usr/local/cuda-12.8/bin:$PATH`). +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) If you are building for NVIDIA's Jetson platforms (Jetson Nano, TX1, TX2, AGX Xavier), Instructions to install PyTorch for Jetson Nano are [available here](https://devtalk.nvidia.com/default/topic/1049071/jetson-nano/pytorch-for-jetson-nano/) @@ -223,7 +247,11 @@ Other potentially useful environment variables may be found in `setup.py`. #### Get the PyTorch Source ```bash +<<<<<<< HEAD git clone --recursive https://github.com/pytorch/pytorch +======= +git clone https://github.com/pytorch/pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cd pytorch # if you are updating an existing checkout git submodule sync @@ -245,7 +273,12 @@ pip install -r requirements.txt ```bash pip install mkl-static mkl-include # CUDA only: Add LAPACK support for the GPU if needed +<<<<<<< HEAD conda install -c pytorch magma-cuda121 # or the magma-cuda* that matches your CUDA version from https://anaconda.org/pytorch/repo +======= +# magma installation: run with active conda environment. specify CUDA version to install +.ci/docker/common/install_magma_conda.sh 12.4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) If using torch.compile with inductor/triton, install the matching version of triton # Run from the pytorch directory after cloning @@ -274,6 +307,7 @@ conda install -c conda-forge libuv=1.39 #### Install PyTorch **On Linux** +<<<<<<< HEAD If you would like to compile PyTorch with [new C++ ABI](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html) enabled, then first run this command: ```bash export _GLIBCXX_USE_CXX11_ABI=1 @@ -281,6 +315,8 @@ export _GLIBCXX_USE_CXX11_ABI=1 Please **note** that starting from PyTorch 2.5, the PyTorch build with XPU supports both new and old C++ ABIs. Previously, XPU only supported the new C++ ABI. If you want to compile with Intel GPU support, please follow [Intel GPU Support](#intel-gpu-support). +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) If you're compiling for AMD ROCm then first run this command: ```bash # Only run this if you're compiling for ROCm @@ -384,14 +420,22 @@ with such a step. 
On Linux ```bash export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}" +<<<<<<< HEAD python setup.py build --cmake-only +======= +CMAKE_ONLY=1 python setup.py build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ccmake build # or cmake-gui build ``` On macOS ```bash export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}" +<<<<<<< HEAD MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py build --cmake-only +======= +MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ CMAKE_ONLY=1 python setup.py build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ccmake build # or cmake-gui build ``` @@ -431,8 +475,22 @@ make -f docker.Makefile ### Building the Documentation +<<<<<<< HEAD To build documentation in various formats, you will need [Sphinx](http://www.sphinx-doc.org) and the readthedocs theme. +======= +To build documentation in various formats, you will need [Sphinx](http://www.sphinx-doc.org) +and the pytorch_sphinx_theme2. + +Before you build the documentation locally, ensure `torch` is +installed in your environment. For small fixes, you can install the +nightly version as described in [Getting Started](https://pytorch.org/get-started/locally/). + +For more complex fixes, such as adding a new module and docstrings for +the new module, you might need to install torch [from source](#from-source). +See [Docstring Guidelines](https://github.com/pytorch/pytorch/wiki/Docstring-Guidelines) +for docstring conventions. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ```bash cd docs/ @@ -446,17 +504,74 @@ Run `make` to get a list of all available output formats. If you get a katex error run `npm install katex`. If it persists, try `npm install -g katex` +<<<<<<< HEAD > Note: if you installed `nodejs` with a different package manager (e.g., `conda`) then `npm` will probably install a version of `katex` that is not compatible with your version of `nodejs` and doc builds will fail. A combination of versions that is known to work is `node@6.13.1` and `katex@0.13.18`. To install the latter with `npm` you can run ```npm install -g katex@0.13.18``` +======= +> [!NOTE] +> If you installed `nodejs` with a different package manager (e.g., +> `conda`) then `npm` will probably install a version of `katex` that is not +> compatible with your version of `nodejs` and doc builds will fail. +> A combination of versions that is known to work is `node@6.13.1` and +> `katex@0.13.18`. To install the latter with `npm` you can run +> ```npm install -g katex@0.13.18``` + +> [!NOTE] +> If you see a numpy incompatibility error, run: +> ``` +> pip install 'numpy<2' +> ``` + +When you make changes to the dependencies run by CI, edit the +`.ci/docker/requirements-docs.txt` file. + +#### Building a PDF + +To compile a PDF of all PyTorch documentation, ensure you have +`texlive` and LaTeX installed. On macOS, you can install them using: + +``` +brew install --cask mactex +``` + +To create the PDF: + +1. Run: + + ``` + make latexpdf + ``` + + This will generate the necessary files in the `build/latex` directory. + +2. Navigate to this directory and execute: + + ``` + make LATEXOPTS="-interaction=nonstopmode" + ``` + + This will produce a `pytorch.pdf` with the desired content. 
Run this + command one more time so that it generates the correct table + of contents and index. + +> [!NOTE] +> To view the Table of Contents, switch to the **Table of Contents** +> view in your PDF viewer. + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ### Previous Versions Installation instructions and binaries for previous PyTorch versions may be found +<<<<<<< HEAD on [our website](https://pytorch.org/previous-versions). +======= +on [our website](https://pytorch.org/get-started/previous-versions). +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ## Getting Started @@ -504,7 +619,11 @@ To learn more about making a contribution to Pytorch, please see our [Contributi PyTorch is a community-driven project with several skillful engineers and researchers contributing to it. PyTorch is currently maintained by [Soumith Chintala](http://soumith.ch), [Gregory Chanan](https://github.com/gchanan), [Dmytro Dzhulgakov](https://github.com/dzhulgakov), [Edward Yang](https://github.com/ezyang), and [Nikita Shulga](https://github.com/malfet) with major contributions coming from hundreds of talented individuals in various forms and means. +<<<<<<< HEAD A non-exhaustive but growing list needs to mention: [Trevor Killeen](https://github.com/killeent), [Sasank Chilamkurthy](https://github.com/chsasank), [Sergey Zagoruyko](https://github.com/szagoruyko), [Adam Lerer](https://github.com/adamlerer), [Francisco Massa](https://github.com/fmassa), [Alykhan Tejani](https://github.com/alykhantejani), [Luca Antiga](https://github.com/lantiga), [Alban Desmaison](https://github.com/albanD), [Andreas Koepf](https://github.com/andreaskoepf), [James Bradbury](https://github.com/jamesb93), [Zeming Lin](https://github.com/ebetica), [Yuandong Tian](https://github.com/yuandong-tian), [Guillaume Lample](https://github.com/glample), [Marat Dukhan](https://github.com/Maratyszcza), [Natalia Gimelshein](https://github.com/ngimel), [Christian Sarofeen](https://github.com/csarofeen), [Martin Raison](https://github.com/martinraison), [Edward Yang](https://github.com/ezyang), [Zachary Devito](https://github.com/zdevito). +======= +A non-exhaustive but growing list needs to mention: [Trevor Killeen](https://github.com/killeent), [Sasank Chilamkurthy](https://github.com/chsasank), [Sergey Zagoruyko](https://github.com/szagoruyko), [Adam Lerer](https://github.com/adamlerer), [Francisco Massa](https://github.com/fmassa), [Alykhan Tejani](https://github.com/alykhantejani), [Luca Antiga](https://github.com/lantiga), [Alban Desmaison](https://github.com/albanD), [Andreas Koepf](https://github.com/andreaskoepf), [James Bradbury](https://github.com/jekbradbury), [Zeming Lin](https://github.com/ebetica), [Yuandong Tian](https://github.com/yuandong-tian), [Guillaume Lample](https://github.com/glample), [Marat Dukhan](https://github.com/Maratyszcza), [Natalia Gimelshein](https://github.com/ngimel), [Christian Sarofeen](https://github.com/csarofeen), [Martin Raison](https://github.com/martinraison), [Edward Yang](https://github.com/ezyang), [Zachary Devito](https://github.com/zdevito). +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Note: This project is unrelated to [hughperkins/pytorch](https://github.com/hughperkins/pytorch) with the same name. 
Hugh is a valuable contributor to the Torch community and has helped with many things Torch and PyTorch. diff --git a/RELEASE.md b/RELEASE.md index 30b03b42435a..828559ea7962 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -19,7 +19,11 @@ - [Cherry Picking Fixes](#cherry-picking-fixes) - [How to do Cherry Picking](#how-to-do-cherry-picking) - [Cherry Picking Reverts](#cherry-picking-reverts) +<<<<<<< HEAD - [Preparing and Creating Final Release candidate](#preparing-and-creating-final-release-candidate) +======= + - [Preparing and Creating Final Release Candidate](#preparing-and-creating-final-release-candidate) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - [Promoting RCs to Stable](#promoting-rcs-to-stable) - [Additional Steps to prepare for release day](#additional-steps-to-prepare-for-release-day) - [Modify release matrix](#modify-release-matrix) @@ -63,7 +67,11 @@ Following is the Release Compatibility Matrix for PyTorch releases: ## Release Cadence +<<<<<<< HEAD Following is the release cadence. All future dates below are tentative, for latest updates on the release scheduled please follow [dev discuss](https://dev-discuss.pytorch.org/c/release-announcements/27). Please note: Patch Releases are optional. +======= +Following is the release cadence. All future dates below are tentative. For latest updates on the release schedule, please follow [dev discuss](https://dev-discuss.pytorch.org/c/release-announcements/27). Please note: Patch Releases are optional. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) | Minor Version | Release branch cut | Release date | First patch release date | Second patch release date| | --- | --- | --- | --- | --- | @@ -91,6 +99,7 @@ Releasing a new version of PyTorch generally entails 3 major steps: ### Frequently Asked Questions +<<<<<<< HEAD * Q: What is release branch cut ? * A: When bulk of the tracked features merged into the main branch, the primary release engineer starts the release process of cutting the release branch by creating a new git branch based off of the current `main` development branch of PyTorch. This allows PyTorch development flow on `main` to continue uninterrupted, while the release engineering team focuses on stabilizing the release branch in order to release a series of release candidates (RC). The activities in the release branch include both regression and performance testing as well as polishing new features and fixing release-specific bugs. In general, new features *are not* added to the release branch after it was created. @@ -105,6 +114,22 @@ Following Requirements needs to be met prior to cutting a release branch: ``` python github_analyze.py --repo-path ~/local/pytorch --remote upstream --branch release/1.11 --milestone-id 26 --missing-in-branch ``` * Validate that all new workflows have been created in the PyTorch and domain libraries included in the release. Validate it against all dimensions of release matrix, including operating systems(Linux, MacOS, Windows), Python versions as well as CPU architectures(x86 and arm) and accelerator versions(CUDA, ROCm, XPU). * All the nightly jobs for pytorch and domain libraries should be green. Validate this using following HUD links: +======= +* Q: What is a release branch cut ? 
+ * A: When bulk of the tracked features merged into the main branch, the primary release engineer starts the release process of cutting the release branch by creating a new git branch based off of the current `main` development branch of PyTorch. This allows PyTorch development flow on `main` to continue uninterrupted, while the release engineering team focuses on stabilizing the release branch in order to release a series of release candidates (RC). The activities in the release branch include both regression and performance testing as well as polishing new features and fixing release-specific bugs. In general, new features *are not* added to the release branch after it was created. + +* Q: What is a cherry-pick ? + * A: A cherry pick is a process of propagating commits from the main into the release branch, utilizing git's built in [cherry-pick feature](https://git-scm.com/docs/git-cherry-pick). These commits are typically limited to small fixes or documentation updates to ensure that the release engineering team has sufficient time to complete a thorough round of testing on the release branch. To nominate a fix for cherry-picking, a separate pull request must be created against the respective release branch and then mentioned in the Release Tracker issue (example: https://github.com/pytorch/pytorch/issues/94937) following the template from the issue description. The comment nominating a particular cherry-pick for inclusion in the release should include the committed PR against main branch, the newly created cherry-pick PR, as well as the acceptance criteria for why the cherry-pick is needed in the first place. This process can be automated by using entering a comment `@pytorchbot cherry-pick -c [reason]` on the PR you wish to cherry-pick. + +## Cutting a release branch preparations + +Following requirements need to be met prior to cutting a release branch: + +* Resolve all outstanding issues in the milestones (for example [1.11.0](https://github.com/pytorch/pytorch/milestone/28)) before first RC cut is completed. After RC cut is completed, the following script should be executed from test-infra repo in order to validate the presence of the fixes in the release branch: +``` python github_analyze.py --repo-path ~/local/pytorch --remote upstream --branch release/1.11 --milestone-id 26 --missing-in-branch ``` +* Validate that all new workflows have been created in the PyTorch and domain libraries included in the release. Validate it against all dimensions of release matrix, including operating systems (Linux, MacOS, Windows), Python versions as well as CPU architectures (x86 and arm) and accelerator versions (CUDA, ROCm, XPU). +* All the nightly jobs for pytorch and domain libraries should be green. Validate this using the following HUD links: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * [Pytorch](https://hud.pytorch.org/hud/pytorch/pytorch/nightly) * [TorchVision](https://hud.pytorch.org/hud/pytorch/vision/nightly) * [TorchAudio](https://hud.pytorch.org/hud/pytorch/audio/nightly) @@ -132,7 +157,11 @@ This script should create 2 branches: ### PyTorch ecosystem libraries *Note*: Release branches for individual ecosystem libraries should be created after first release candidate build of PyTorch is available in staging channels (which happens about a week after PyTorch release branch has been created). This is absolutely required to allow sufficient testing time for each of the domain library. 
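As a quick sanity check at this point, it can help to confirm that the build installed from the staging channel is really the release candidate you intend to validate against; a minimal sketch using only public `torch` attributes (the printed values depend entirely on your environment, nothing here is prescribed by the release process):

```python
# Sketch: confirm which PyTorch build is installed before cutting an
# ecosystem library branch. Output values depend on your install.
import torch

print(torch.__version__)          # version string reported by the installed wheel
print(torch.version.git_version)  # commit the wheel was built from
print(torch.version.cuda)         # CUDA toolkit the wheel targets (None for CPU/ROCm builds)
```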
Domain libraries branch cut is performed by Ecosystem Library POC. +<<<<<<< HEAD Test-Infra branch cut should be performed at the same time as Pytorch core branch cut. Convenience script can also be used domains. +======= +Test-Infra branch cut should be performed at the same time as Pytorch core branch cut. Convenience script can also be used for domains. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) > NOTE: RELEASE_VERSION only needs to be specified if version.txt is not available in root directory @@ -141,8 +170,16 @@ DRY_RUN=disabled GIT_BRANCH_TO_CUT_FROM=main RELEASE_VERSION=1.11 scripts/releas ``` ### Making release branch specific changes for PyTorch +<<<<<<< HEAD These are examples of changes that should be made to release branches so that CI / tooling can function normally on +======= +First you should cut a release branch for pytorch/test-infra: +* Create a new branch using the naming convention `release/[major].[minor]`, e.g. `release/2.7` +* On that release branch, update branch pointers for any pytorch-managed reusable actions or workflows to point to the new release's branch ([example](https://github.com/pytorch/test-infra/commit/749b9e36afa23298ad5498c9f5bcd96f5467baff#diff-d41015f3ac6cfa64b00e366bec416bb9487ac27493de7ebe7778fdfc7518b003R39)). + +Here are examples of changes that should be made to the pytorch/pytorch release branches so that CI / tooling can function normally on +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) them: * Update backwards compatibility tests to use RC binaries instead of nightlies @@ -163,8 +200,15 @@ Ecosystem libraries branch cut is done a few days after branch cut for the `pyto After the branch cut is performed, the Pytorch Dev Infra member should be informed of the branch cut and Domain Library specific change is required before Drafting RC for this domain library. Follow these examples of PR that updates the version and sets RC Candidate upload channel: +<<<<<<< HEAD * torchvision : https://github.com/pytorch/vision/pull/5400 * torchaudio: https://github.com/pytorch/audio/pull/2210 +======= +* torchvision : [Update version.txt](https://github.com/pytorch/vision/pull/8968) and [change workflow branch references](https://github.com/pytorch/vision/pull/8969) +* torchaudio: [Update version.txt](https://github.com/pytorch/audio/commit/654fee8fd17784271be1637eac1293fd834b4e9a) and [change workflow branch references](https://github.com/pytorch/audio/pull/3890) + +The CI workflow updating part of the above PRs can be automated by running: `python release/apply-release-changes.py [version]` (where version is something like '2.7'). That script lives in both pytorch/audio and pytorch/vision. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ## Running Launch Execution team Core XFN sync @@ -207,9 +251,13 @@ git tag -f v1.12.0-rc2 git push origin v1.12.0-rc2 ``` +<<<<<<< HEAD Pushing a release candidate should trigger the `binary_builds` workflow within CircleCI using [`pytorch/pytorch-probot`](https://github.com/pytorch/pytorch-probot)'s [`trigger-circleci-workflows`](trigger-circleci-workflows) functionality. 
This trigger functionality is configured here: [`pytorch-circleci-labels.yml`](https://github.com/pytorch/pytorch/blob/main/.github/pytorch-circleci-labels.yml) +======= +Pushing a release candidate tag should trigger the `binary_build` workflows. This trigger functionality is configured in [`linux_binary_build_workflow.yml.j2]`][(https://github.com/pytorch/pytorch/blob/main/.github/pytorch-circleci-labels.yml](https://github.com/pytorch/pytorch/blob/main/.github/templates/linux_binary_build_workflow.yml.j2#L19-L22)) and in the matching templates for the other OSes. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) To view the state of the release build, please navigate to [HUD](https://hud.pytorch.org/hud/pytorch/pytorch/release%2F1.12). And make sure all binary builds are successful. ### Release Candidate Storage @@ -218,18 +266,30 @@ Release candidates are currently stored in the following places: * Wheels: https://download.pytorch.org/whl/test/ * Conda: https://anaconda.org/pytorch-test +<<<<<<< HEAD * Libtorch: https://download.pytorch.org/libtorch/test +======= +* Libtorch: https://download.pytorch.org/libtorch/test +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Backups are stored in a non-public S3 bucket at [`s3://pytorch-backup`](https://s3.console.aws.amazon.com/s3/buckets/pytorch-backup?region=us-east-1&tab=objects) ### Release Candidate health validation +<<<<<<< HEAD Validate the release jobs for pytorch and domain libraries should be green. Validate this using following HUD links: +======= +Validate that the release jobs for pytorch and domain libraries are green. Validate this using the following HUD links: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * [Pytorch](https://hud.pytorch.org/hud/pytorch/pytorch/release%2F1.12) * [TorchVision](https://hud.pytorch.org/hud/pytorch/vision/release%2F1.12) * [TorchAudio](https://hud.pytorch.org/hud/pytorch/audio/release%2F1.12) +<<<<<<< HEAD Validate that the documentation build has completed and generated entry corresponding to the release in [docs repository](https://github.com/pytorch/docs/tree/main/). +======= +Validate that the documentation build has completed and generated an entry corresponding to the release in the [docs repository](https://github.com/pytorch/docs/tree/main/). +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ### Cherry Picking Fixes @@ -274,6 +334,7 @@ requires `pytorchbot`, so it's only available in PyTorch atm. ### Cherry Picking Reverts +<<<<<<< HEAD If PR that has been cherry-picked into release branch has been reverted, its cherry-pick must be reverted as well. Reverts for changes that was committed into the main branch prior to the branch cut, must be propagated into release branch as well. @@ -283,6 +344,17 @@ Reverts for changes that was committed into the main branch prior to the branch The following requirements need to be met prior to creating final Release Candidate : * Resolve all outstanding open issues in the milestone. There should be no open issues/PRs (for example [2.1.2](https://github.com/pytorch/pytorch/milestone/39)). The issue should either be closed or de-milestoned. 
+======= +If a PR that has been cherry-picked into the release branch has been reverted, its cherry-pick must be reverted as well. + +Reverts for changes that were committed into the main branch prior to the branch cut must be propagated into the release branch as well. + +## Preparing and Creating Final Release Candidate + +The following requirements need to be met prior to creating the final Release Candidate: + +* Resolve all outstanding open issues in the milestone. There should be no open issues/PRs (for example [2.1.2](https://github.com/pytorch/pytorch/milestone/39)). Each issue should either be closed or de-milestoned. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * Validate that all closed milestone PRs are present in the release branch. Confirm this by running: ``` python github_analyze.py --repo-path ~/local/pytorch --remote upstream --branch release/2.2 --milestone-id 40 --missing-in-branch ``` @@ -291,7 +363,11 @@ The following requirements need to be met prior to creating final Release Candid * Perform [Release Candidate health validation](#release-candidate-health-validation). CI should have the green signal. +<<<<<<< HEAD After the final RC is created. The following tasks should be performed : +======= +After the final RC is created, the following tasks should be performed: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * Perform [Release Candidate health validation](#release-candidate-health-validation). CI should have the green signal. @@ -319,6 +395,7 @@ Promotion should occur in two steps: * Promote S3 artifacts (wheels, libtorch) and Conda packages * Promote S3 wheels to PyPI +<<<<<<< HEAD **NOTE**: The promotion of wheels to PyPI can only be done once so take caution when attempting to promote wheels to PyPI, (see https://github.com/pypa/warehouse/issues/726 for a discussion on potential draft releases within PyPI) ## Additional Steps to prepare for release day @@ -336,12 +413,35 @@ Please note: This PR needs to be merged on the release day and hence it should b ### Open Google Colab issue This is normally done right after the release is completed. We would need to create Google Colab Issue see following [PR](https://github.com/googlecolab/colabtools/issues/2372) +======= +**NOTE**: The promotion of wheels to PyPI can only be done once so take caution when attempting to promote wheels to PyPI, (see https://github.com/pypi/warehouse/issues/726 for a discussion on potential draft releases within PyPI) + +## Additional Steps to prepare for release day + +The following should be prepared for the release day: + +### Modify release matrix + +Modify the release matrix for the get started page. See the following [PR](https://github.com/pytorch/test-infra/pull/4611) as reference. + +The PR to update published_versions.json and quick-start-module.js is auto generated. See the following [PR](https://github.com/pytorch/pytorch.github.io/pull/1467) as reference. + +Please note: This PR needs to be merged on the release day and hence it should be absolutely free of any failures. To test this PR, open another test PR pointing to the Release Candidate location as described in the [Release Candidate Storage](#release-candidate-storage) section. + +### Open Google Colab issue + +This is normally done right after the release is completed. We need to create a Google Colab issue. 
See the following example [issue](https://github.com/googlecolab/colabtools/issues/2372) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Patch Releases A patch release is a maintenance release of PyTorch that includes fixes for regressions found in a previous minor release. Patch releases typically will bump the `patch` version from semver (i.e. `[major].[minor].[patch]`). +<<<<<<< HEAD Please note: Starting from 2.1 one can expect up to 2 patch releases after every minor ones. Patch releases would only be published for latest minor release. +======= +Please note: Starting from 2.1, one can expect up to 2 patch releases after every minor release. Patch releases are only published for the latest minor release. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ## Patch Release Criteria @@ -363,6 +463,7 @@ Patch releases should be considered if a regression meets the following criteria > Main POC: Patch Release Managers, Triage Reviewers Patch releases should follow these high-level phases. This process starts immediately after the previous release has completed. +<<<<<<< HEAD Patch release process takes around 4-5 weeks to complete. 1. Triage, is a process where issues are identified, graded, compared to Patch Release Criteria and added to Patch Release milestone. This process normally takes 2 weeks after the release completion. @@ -372,20 +473,43 @@ Patch release process takes around 4-5 weeks to complete. 3. Cherry picking phase starts after the decision is made to create patch release. At this point a new release tracker for the patch release is created, and an announcement will be made on official channels [example announcement](https://dev-discuss.pytorch.org/t/pytorch-release-2-0-1-important-information/1176). The authors of the fixes to regressions will be asked to create their own cherry picks. This process normally takes 2 weeks. 4. Building Binaries, Promotion to Stable and testing. After all cherry picks have been merged, Release Managers trigger new build and produce new release candidate. Announcement is made on the official channel about the RC availability at this point. This process normally takes 2 weeks. 5. General Availability +======= +The patch release process takes around 4-5 weeks to complete. + +1. Triage is a process where issues are identified, graded, compared to Patch Release Criteria and added to Patch Release milestone. This process normally takes 2 weeks after the release completion. +2. Go/No Go meeting between PyTorch Releng, PyTorch Core and Project Managers where potential issues triggering a release in milestones are reviewed, and following decisions are made: + * Should the new patch release be created? + * Timeline execution for the patch release +3. Cherry picking phase starts after the decision is made to create a patch release. At this point, a new release tracker for the patch release is created, and an announcement will be made on official channels [example announcement](https://dev-discuss.pytorch.org/t/pytorch-release-2-0-1-important-information/1176). The authors of the fixes to regressions will be asked to create their own cherry picks. This process normally takes 2 weeks. +4. Updating `version.txt` in the release branch to match expected patch release version, see https://github.com/pytorch/pytorch/commit/f77213d3dae5d103a39cdaf93f21863843571e8d as an example +5. 
Building Binaries, Promotion to Stable and testing. After all cherry picks have been merged, Release Managers trigger a new build and produce a new release candidate. An announcement is made on the official channel about the RC availability at this point. This process normally takes 2 weeks. +6. General Availability +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ### Triage > Main POC: Triage Reviewers +<<<<<<< HEAD 1. Tag issues / pull requests that are candidates for a potential patch release with `triage review` * ![adding triage review label](https://user-images.githubusercontent.com/1700823/132589089-a9210a14-6159-409d-95e5-f79067f6fa38.png) 2. Triage reviewers will then check if the regression / fix identified fits within above mentioned [Patch Release Criteria](#patch-release-criteria) 3. Triage reviewers will then add the issue / pull request to the related milestone (i.e. `1.9.1`) if the regressions is found to be within the [Patch Release Criteria](#patch-release-criteria) +======= +1. Tag issues/pull requests that are candidates for a potential patch release with `triage review` + * ![adding triage review label](https://user-images.githubusercontent.com/1700823/132589089-a9210a14-6159-409d-95e5-f79067f6fa38.png) +2. Triage reviewers will then check if the regression/fix identified fits within the above mentioned [Patch Release Criteria](#patch-release-criteria) +3. Triage reviewers will then add the issue/pull request to the related milestone (i.e. `1.9.1`) if the regression is found to be within the [Patch Release Criteria](#patch-release-criteria) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * ![adding to milestone](https://user-images.githubusercontent.com/1700823/131175980-148ff38d-44c3-4611-8a1f-cd2fd1f4c49d.png) ### Issue Tracker for Patch releases +<<<<<<< HEAD For patch releases issue tracker needs to be created. For patch release, we require all cherry-pick changes to have links to either a high-priority GitHub issue or a CI failure from previous RC. An example of this would look like: +======= +For patch releases, an issue tracker needs to be created. For a patch release, we require all cherry-pick changes to have links to either a high-priority GitHub issue or a CI failure from previous RC. An example of this would look like: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * https://github.com/pytorch/pytorch/issues/128436 Only following issues are accepted: diff --git a/SECURITY.md b/SECURITY.md index 79514f2c282b..dfd67a86a76c 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -3,6 +3,10 @@ - [**Reporting a Vulnerability**](#reporting-a-vulnerability) - [**Using Pytorch Securely**](#using-pytorch-securely) - [Untrusted models](#untrusted-models) +<<<<<<< HEAD +======= + - [TorchScript models](#torchscript-models) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - [Untrusted inputs](#untrusted-inputs) - [Data privacy](#data-privacy) - [Using distributed features](#using-distributed-features) @@ -38,6 +42,13 @@ Important Note: The trustworthiness of a model is not binary. 
You must always de https://arxiv.org/abs/2312.04748 https://arxiv.org/abs/2401.05566 +<<<<<<< HEAD +======= +### TorchScript models + +TorchScript models should treated the same way as locally executable code from an unknown source. Only run TorchScript models if you trust the provider. Please note, that tools for introspecting TorchScript models (such as `torch.utils.model_dump`) may also execute partial or full code stored in those models, therefore they should be used only if you trust the provider of the binary you are about to load. + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ### Untrusted inputs during training and prediction If you plan to open your model to untrusted inputs, be aware that inputs can also be used as vectors by malicious agents. To minimize risks, make sure to give your model only the permissions strictly required, and keep your libraries updated with the latest security patches. diff --git a/WORKSPACE b/WORKSPACE index ae7c0644e203..2fb9833df67a 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -144,8 +144,13 @@ new_local_repository( new_local_repository( name = "asmjit", +<<<<<<< HEAD build_file = "//third_party:fbgemm/third_party/asmjit.BUILD", path = "third_party/fbgemm/third_party/asmjit", +======= + build_file = "//third_party:fbgemm/external/asmjit.BUILD", + path = "third_party/fbgemm/external/asmjit", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) new_local_repository( @@ -185,6 +190,15 @@ new_local_repository( ) new_local_repository( +<<<<<<< HEAD +======= + name = "moodycamel", + build_file = "//third_party:moodycamel.BUILD", + path = "third_party/concurrentqueue", +) + +new_local_repository( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) name = "tensorpipe", build_file = "//third_party:tensorpipe.BUILD", path = "third_party/tensorpipe", diff --git a/android/README.md b/android/README.md index d6a1ba1d4479..5b8e588dcd5f 100644 --- a/android/README.md +++ b/android/README.md @@ -2,7 +2,13 @@ ## Demo applications and tutorials +<<<<<<< HEAD Demo applications with code walk-through can be find in [this github repo](https://github.com/pytorch/android-demo-app). +======= +Please refer to [pytorch-labs/executorch-examples](https://github.com/pytorch-labs/executorch-examples/tree/main/dl3/android/DeepLabV3Demo) for the Android demo app based on [ExecuTorch](https://github.com/pytorch/executorch). + +Please join our [Discord](https://discord.com/channels/1334270993966825602/1349854760299270284) for any questions. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ## Publishing @@ -119,8 +125,11 @@ We also have to add all transitive dependencies of our aars. As `pytorch_android` [depends](https://github.com/pytorch/pytorch/blob/master/android/pytorch_android/build.gradle#L76-L77) on `'com.facebook.soloader:nativeloader:0.10.5'` and `'com.facebook.fbjni:fbjni-java-only:0.2.2'`, we need to add them. (In case of using maven dependencies they are added automatically from `pom.xml`). +<<<<<<< HEAD You can check out [test app example](https://github.com/pytorch/pytorch/blob/master/android/test_app/app/build.gradle) that uses aars directly. 
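Returning to the TorchScript-models note added in SECURITY.md above: a minimal sketch of the kind of guard that note implies, loading an archive only after verifying it is the exact artifact from a provider you already trust (the path and digest below are hypothetical placeholders, not part of the original text):

```python
# Sketch only: gate torch.jit.load on an integrity check, since loading or even
# introspecting a TorchScript archive can execute code embedded in it.
# "model.pt" and EXPECTED_SHA256 are hypothetical placeholders.
import hashlib

import torch

EXPECTED_SHA256 = "<digest published by a trusted provider>"

def load_trusted_scripted_model(path: str) -> torch.jit.ScriptModule:
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).hexdigest()
    if digest != EXPECTED_SHA256:
        raise RuntimeError(f"Refusing to load {path}: unexpected digest {digest}")
    return torch.jit.load(path)

# model = load_trusted_scripted_model("model.pt")
```

This does not make an untrusted archive safe to run; it only ensures you load exactly the artifact you had already decided to trust.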
+======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ## Linking to prebuilt libtorch library from gradle dependency In some cases, you may want to use libtorch from your android native build. @@ -202,7 +211,11 @@ find_library(FBJNI_LIBRARY fbjni NO_CMAKE_FIND_ROOT_PATH) target_link_libraries(${PROJECT_NAME} +<<<<<<< HEAD ${PYTORCH_LIBRARY}) +======= + ${PYTORCH_LIBRARY} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ${FBJNI_LIBRARY}) ``` @@ -233,8 +246,11 @@ void loadAndForwardModel(const std::string& modelPath) { To load torchscript model for mobile we need some special setup which is placed in `struct JITCallGuard` in this example. It may change in future, you can track the latest changes keeping an eye in our [pytorch android jni code]([https://github.com/pytorch/pytorch/blob/master/android/pytorch_android/src/main/cpp/pytorch_jni_jit.cpp#L28) +<<<<<<< HEAD [Example of linking to libtorch from aar](https://github.com/pytorch/pytorch/tree/master/android/test_app) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ## PyTorch Android API Javadoc You can find more details about the PyTorch Android API in the [Javadoc](https://pytorch.org/javadoc/). diff --git a/android/pytorch_android/CMakeLists.txt b/android/pytorch_android/CMakeLists.txt index 0d46f87094ca..1a6de640a518 100644 --- a/android/pytorch_android/CMakeLists.txt +++ b/android/pytorch_android/CMakeLists.txt @@ -1,4 +1,8 @@ +<<<<<<< HEAD cmake_minimum_required(VERSION 3.4.1) +======= +cmake_minimum_required(VERSION 3.5) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) option(BUILD_LITE_INTERPRETER "Master flag to build pytorch_jni_lite" ON) message( STATUS diff --git a/android/pytorch_android_torchvision/CMakeLists.txt b/android/pytorch_android_torchvision/CMakeLists.txt index 849e4d07cc1d..d001a2d2ab82 100644 --- a/android/pytorch_android_torchvision/CMakeLists.txt +++ b/android/pytorch_android_torchvision/CMakeLists.txt @@ -1,4 +1,8 @@ +<<<<<<< HEAD cmake_minimum_required(VERSION 3.4.1) +======= +cmake_minimum_required(VERSION 3.5) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) project(pytorch_vision_jni CXX) set(CMAKE_CXX_STANDARD 17 CACHE STRING "The C++ standard whose features are requested to build this target.") set(CMAKE_VERBOSE_MAKEFILE ON) diff --git a/android/settings.gradle b/android/settings.gradle index 743f388b6507..5a291d7df9dc 100644 --- a/android/settings.gradle +++ b/android/settings.gradle @@ -3,4 +3,7 @@ include ':app', ':pytorch_android', ':pytorch_android_torchvision', ':pytorch_ho project(':pytorch_android_torchvision').projectDir = file('pytorch_android_torchvision') project(':pytorch_host').projectDir = file('pytorch_android/host') +<<<<<<< HEAD project(':test_app').projectDir = file('test_app/app') +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt index bda6aea32706..54facb45fd95 100644 --- a/aten/CMakeLists.txt +++ b/aten/CMakeLists.txt @@ -18,6 +18,10 @@ cmake_policy(SET CMP0012 NEW) ############################################# set(ATen_CPU_SRCS) +<<<<<<< HEAD 
+======= +set(ATen_MTIA_SRCS) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set(ATen_XPU_SRCS) set(ATen_XPU_INCLUDE) set(ATen_CPU_TEST_SRCS) @@ -101,6 +105,16 @@ else() set(AT_CUSPARSELT_ENABLED 1) endif() +<<<<<<< HEAD +======= +# Add hipSPARSELt support flag if the package is available. +if(USE_ROCM AND hipsparselt_FOUND) + set(AT_HIPSPARSELT_ENABLED 1) +else() + set(AT_HIPSPARSELT_ENABLED 0) +endif() + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) list(APPEND ATen_CPU_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/src) add_subdirectory(src/ATen) @@ -108,6 +122,10 @@ add_subdirectory(src/ATen) # Pass source, includes, and libs to parent set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE) +<<<<<<< HEAD +======= +set(ATen_MTIA_SRCS ${ATen_MTIA_SRCS} PARENT_SCOPE) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set(ATen_XPU_SRCS ${ATen_XPU_SRCS} PARENT_SCOPE) set(ATen_XPU_INCLUDE ${ATen_XPU_INCLUDE} PARENT_SCOPE) set(ATen_CUDA_CU_SRCS ${ATen_CUDA_CU_SRCS} PARENT_SCOPE) diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 7ae6e39d7289..73be0e752477 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -1,4 +1,8 @@ +<<<<<<< HEAD cmake_minimum_required(VERSION 3.18 FATAL_ERROR) +======= +cmake_minimum_required(VERSION 3.27 FATAL_ERROR) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH}) if(NOT MSVC) @@ -34,6 +38,10 @@ set_bool(AT_MAGMA_ENABLED USE_MAGMA) set_bool(CAFFE2_STATIC_LINK_CUDA_INT CAFFE2_STATIC_LINK_CUDA) set_bool(AT_CUDNN_ENABLED CAFFE2_USE_CUDNN) set_bool(AT_CUSPARSELT_ENABLED CAFFE2_USE_CUSPARSELT) +<<<<<<< HEAD +======= +set_bool(AT_HIPSPARSELT_ENABLED CAFFE2_USE_HIPSPARSELT) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) configure_file(Config.h.in "${CMAKE_CURRENT_SOURCE_DIR}/Config.h") # TODO: Do not generate CUDAConfig.h for ROCm BUILDS @@ -65,6 +73,15 @@ file(GLOB cudnn_h "cudnn/*.h" "cudnn/*.cuh") file(GLOB cudnn_cpp "cudnn/*.cpp") file(GLOB ops_h "ops/*.h") +<<<<<<< HEAD +======= +# MTIA +file(GLOB mtia_h "mtia/*.h" "mtia/detail/*.h") +file(GLOB mtia_cpp "mtia/*.cpp" "mtia/detail/*.cpp") +file(GLOB_RECURSE native_mtia_cpp "native/mtia/*.cpp") +file(GLOB_RECURSE native_mtia_h "native/mtia/*.h") + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) file(GLOB xpu_h "xpu/*.h" "xpu/detail/*.h") file(GLOB xpu_cpp "xpu/*.cpp" "xpu/detail/*.cpp") @@ -162,6 +179,7 @@ file(GLOB native_transformers_hip_hip "native/transformers/hip/*.hip") file(GLOB native_transformers_hip_cpp "native/transformers/hip/*.cpp") file(GLOB native_quantized_cudnn_hip_cpp "native/quantized/cudnn/hip/*.cpp") file(GLOB native_utils_cpp "native/utils/*.cpp") +<<<<<<< HEAD # flash_attention sources file(GLOB flash_attention_cuda_kernels_cu ${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc/flash_attn/src/*.cu) @@ -170,6 +188,12 @@ file(GLOB flash_attention_cuda_cpp "${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc/flash_attn/src/*.cpp" 
"native/transformers/cuda/flash_attn/flash_api.cpp" ) +======= +file(GLOB flash_attention_cuda_kernels_cu ${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc/flash_attn/src/*.cu) +file(GLOB flash_attention_cuda_cpp ${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc/flash_attn/src/*.cpp) +file(GLOB native_flash_attn_api_cpp "native/transformers/cuda/flash_attn/flash_api.cpp") + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # flash_attention hip sources file(GLOB flash_attention_hip_hip "native/transformers/hip/flash_attn/*.hip") @@ -201,10 +225,36 @@ file(GLOB mem_eff_attention_cuda_cu "native/transformers/cuda/mem_eff_attention/ file(GLOB mem_eff_attention_cuda_kernels_cu "native/transformers/cuda/mem_eff_attention/kernels/*.cu") file(GLOB mem_eff_attention_cuda_cpp "native/transformers/cuda/mem_eff_attention/*.cpp") +<<<<<<< HEAD if(USE_FLASH_ATTENTION) list(APPEND native_transformers_cuda_cu ${flash_attention_cuda_cu}) list(APPEND native_transformers_cuda_cu ${flash_attention_cuda_kernels_cu}) list(APPEND native_transformers_cuda_cpp ${flash_attention_cuda_cpp}) +======= +if(USE_CUDA AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION)) + add_library(flash_attention OBJECT EXCLUDE_FROM_ALL ${flash_attention_cuda_kernels_cu} ${flash_attention_cuda_cpp}) + + target_include_directories(flash_attention PUBLIC + ${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc + ${PROJECT_SOURCE_DIR}/third_party/flash-attention/include + ${PROJECT_SOURCE_DIR}/third_party/cutlass/include + ${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc/flash_attn/src + ) + + target_compile_definitions(flash_attention PRIVATE + # Copied from https://github.com/pytorch/pytorch/blob/a10024d7dea47c52469059a47efe376eb20adca0/caffe2/CMakeLists.txt#L1431 + FLASH_NAMESPACE=pytorch_flash + FLASHATTENTION_DISABLE_ALIBI + FLASHATTENTION_DISABLE_SOFTCAP + UNFUSE_FMA + ) + + set_target_properties(flash_attention PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif() + +if(USE_FLASH_ATTENTION) + list(APPEND native_transformers_cuda_cpp ${native_flash_attn_api_cpp}) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) list(APPEND FLASH_ATTENTION_CUDA_SOURCES ${flash_attention_cuda_cu} ${flash_attention_cuda_kernels_cu}) list(APPEND ATen_ATTENTION_KERNEL_SRCS ${flash_attention_cuda_kernels_cu}) @@ -259,7 +309,10 @@ if(AT_MKL_ENABLED) endif() if(AT_KLEIDIAI_ENABLED) set(all_cpu_cpp ${all_cpu_cpp} ${native_kleidiai}) +<<<<<<< HEAD include_directories(SYSTEM INTERFACE ${KLEIDIAI_INCLUDE_DIRS}) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) endif() if(AT_MKLDNN_ENABLED) set(all_cpu_cpp ${all_cpu_cpp} ${mkldnn_cpp}) @@ -270,6 +323,13 @@ else() set(all_cpu_cpp ${all_cpu_cpp} ${vulkan_cpp}) endif() +<<<<<<< HEAD +======= +if(USE_MTIA) + set(ATen_MTIA_SRCS ${ATen_MTIA_SRCS} ${mtia_cpp} ${mtia_h} ${native_mtia_cpp} ${native_mtia_h}) +endif() + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if(USE_XPU) list(APPEND ATen_XPU_SRCS ${mkldnn_xpu_cpp}) list(APPEND ATen_XPU_DEPENDENCY_LIBS xpu_mkldnn) @@ -384,12 +444,20 @@ endif() ${native_quantized_hip_hip} ${native_transformers_hip_hip} ${native_transformers_src_hip_hip} ) +<<<<<<< HEAD if(WIN32) # Windows doesn't support Composable Kernels and Triton file(GLOB 
native_hip_bgemm "native/hip/bgemm_kernels/*.hip") file(GLOB native_hip_ck "native/hip/ck*.hip") exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" ${native_hip_bgemm} ${native_hip_ck} ${native_transformers_hip_hip} ${native_transformers_hip_cpp}) +======= + if(WIN32) # Windows doesn't support Composable Kernels + file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip") + file(GLOB native_hip_ck "native/hip/ck*.hip") + exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" + ${native_hip_bgemm} ${native_hip_ck}) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) endif() # TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources) list(APPEND all_hip_cpp @@ -408,9 +476,12 @@ endif() ${miopen_cpp} ${all_hip_cpp} ) +<<<<<<< HEAD if(WIN32) # Windows doesn't support Triton exclude(all_hip_cpp "${all_hip_cpp}" ${native_transformers_hip_cpp}) endif() +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) endif() if(USE_XPU) @@ -422,6 +493,7 @@ endif() list(APPEND ATen_CPU_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/..) if(BLAS_FOUND) +<<<<<<< HEAD if($ENV{TH_BINARY_BUILD}) message(STATUS "TH_BINARY_BUILD detected. Enabling special linkage.") list(APPEND ATen_CPU_DEPENDENCY_LIBS @@ -429,6 +501,9 @@ if(BLAS_FOUND) else($ENV{TH_BINARY_BUILD}) list(APPEND ATen_CPU_DEPENDENCY_LIBS ${BLAS_LIBRARIES}) endif($ENV{TH_BINARY_BUILD}) +======= + list(APPEND ATen_CPU_DEPENDENCY_LIBS ${BLAS_LIBRARIES}) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) endif(BLAS_FOUND) if(LAPACK_FOUND) @@ -614,8 +689,12 @@ endif() if($ENV{TH_BINARY_BUILD}) # Do not do this on Linux: see Note [Extra MKL symbols for MAGMA in torch_cpu] # in caffe2/CMakeLists.txt +<<<<<<< HEAD list(APPEND ATen_CUDA_DEPENDENCY_LIBS "${BLAS_LIBRARIES};${BLAS_LIBRARIES};${BLAS_LIBRARIES}") +======= + list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${BLAS_LIBRARIES}) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) endif($ENV{TH_BINARY_BUILD}) endif(MSVC) endif(USE_MAGMA) @@ -689,7 +768,11 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake" set(INSTALL_HEADERS ${base_h} ${ATen_CORE_HEADERS} ${native_nested_h} ${ATen_TRANSFORMER_HEADERS}) if(NOT INTERN_BUILD_MOBILE) +<<<<<<< HEAD list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_ao_sparse_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${cudnn_h} ${hip_h} ${xpu_h} ${mps_h} ${native_kleidiai_h} ${native_mps_h} ${native_utils_h} ${miopen_h} ${mkldnn_xpu_h}) +======= + list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_ao_sparse_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${native_mtia_h} ${cudnn_h} ${hip_h} ${mtia_h} ${xpu_h} ${mps_h} ${native_kleidiai_h} ${native_mps_h} ${native_utils_h} ${miopen_h} ${mkldnn_xpu_h}) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Metal if(USE_PYTORCH_METAL_EXPORT) # Add files needed from exporting metal models(optimized_for_mobile) @@ -760,6 +843,10 @@ set(ATen_CUDA_CU_SRCS_W_SORT_BY_KEY ${ATen_CUDA_CU_SRCS_W_SORT_BY_KEY} PARENT_SC set(ATen_NVRTC_STUB_SRCS ${ATen_NVRTC_STUB_SRCS} PARENT_SCOPE) set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE) set(ATen_MPS_SRCS ${ATen_MPS_SRCS} 
PARENT_SCOPE) +<<<<<<< HEAD +======= +set(ATen_MTIA_SRCS ${ATen_MTIA_SRCS} PARENT_SCOPE) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set(ATen_XPU_SRCS ${ATen_XPU_SRCS} PARENT_SCOPE) set(ATen_QUANTIZED_SRCS ${ATen_QUANTIZED_SRCS} PARENT_SCOPE) set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) diff --git a/aten/src/ATen/CPUGeneratorImpl.cpp b/aten/src/ATen/CPUGeneratorImpl.cpp index 4bbe3624a5b0..08ad9b003fd9 100644 --- a/aten/src/ATen/CPUGeneratorImpl.cpp +++ b/aten/src/ATen/CPUGeneratorImpl.cpp @@ -69,7 +69,11 @@ Generator createCPUGenerator(uint64_t seed_val) { * Helper function to concatenate two 32 bit unsigned int * and return them as a 64 bit unsigned int */ +<<<<<<< HEAD inline uint64_t make64BitsFrom32Bits(uint32_t hi, uint32_t lo) { +======= +inline static uint64_t make64BitsFrom32Bits(uint32_t hi, uint32_t lo) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (static_cast(hi) << 32) | lo; } diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 08e49d5e1b57..26d61641cdc3 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -335,15 +335,28 @@ at::BlasBackend Context::blasPreferredBackend() { static const bool hipblaslt_preferred = []() { static const std::vector archs = { "gfx90a", "gfx942", +<<<<<<< HEAD #if ROCM_VERSION >= 60300 "gfx1200", "gfx1201", #endif +======= +#if ROCM_VERSION >= 60400 + "gfx1200", "gfx1201", +#endif +#if ROCM_VERSION >= 60402 + "gfx1150", "gfx1151", +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if ROCM_VERSION >= 60500 "gfx950" #endif }; for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) { +<<<<<<< HEAD if (!detail::getCUDAHooks().isGPUArch(index, archs)) { +======= + if (!detail::getCUDAHooks().isGPUArch(archs, index)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return false; } } @@ -364,12 +377,22 @@ at::BlasBackend Context::blasPreferredBackend() { #if ROCM_VERSION >= 60300 "gfx1100", "gfx1101", "gfx1200", "gfx1201", #endif +<<<<<<< HEAD +======= +#if ROCM_VERSION >= 60402 + "gfx1150", "gfx1151", +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if ROCM_VERSION >= 60500 "gfx950" #endif }; for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) { +<<<<<<< HEAD if (!detail::getCUDAHooks().isGPUArch(index, archs)) { +======= + if (!detail::getCUDAHooks().isGPUArch(archs, index)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN_ONCE( "Attempting to use hipBLASLt on an unsupported architecture! " "Overriding blas backend to hipblas"); @@ -422,7 +445,11 @@ void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) { "gfx90a", "gfx942", "gfx950" }; for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) { +<<<<<<< HEAD if (!detail::getCUDAHooks().isGPUArch(index, archs)) { +======= + if (!detail::getCUDAHooks().isGPUArch(archs, index)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN_ONCE( "Attempting to use CK on an unsupported architecture! 
Cannot set backend to CK"); return true; @@ -618,7 +645,11 @@ Allocator* getCPUAllocator() { // means the allow_tf32 flags are overridden and tf32 is force disabled // override_allow_tf32_flag = false // means the original allow_tf32 flags are followed +<<<<<<< HEAD thread_local bool override_allow_tf32_flag = false; +======= +thread_local static bool override_allow_tf32_flag = false; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) NoTF32Guard::NoTF32Guard() { if (!override_allow_tf32_flag) { @@ -641,7 +672,11 @@ bool NoTF32Guard::should_disable_tf32() { // This information can be used, for example, to select implementations // with different numerical or performance characteristics. // See https://pytorch.org/docs/stable/notes/numerical_accuracy.html for details. +<<<<<<< HEAD thread_local bool rocm_is_backward_pass; +======= +thread_local static bool rocm_is_backward_pass; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ROCmBackwardPassGuard::ROCmBackwardPassGuard() { rocm_is_backward_pass = true; @@ -697,7 +732,11 @@ void Context::setAllowFP16ReductionCPU(bool b) { #else if (true) #endif +<<<<<<< HEAD throw std::runtime_error("Float16 arithmetic is not supported by the CPU!"); +======= + TORCH_CHECK(false, "Float16 arithmetic is not supported by the CPU!"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } allow_fp16_reduction_cpu = b; } diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 7d0f4c445f38..5ccbdb346cd2 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -550,7 +550,12 @@ inline size_t getNumGPUs() { // devices for a specific device type, add that function to the // relevant library (e.g., similar to at::cuda::device_count()) if (hasCUDA() && hasHIP()) { +<<<<<<< HEAD throw std::runtime_error( +======= + TORCH_CHECK( + false, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "Enabling both CUDA and HIP in ATen is not supported, as HIP masquerades " "to be CUDA (e.g., when you say CUDA, on a HIP build of ATen, this actually " "means HIP. 
Rebuild PyTorch with one or the other disabled."); diff --git a/aten/src/ATen/DLConvertor.cpp b/aten/src/ATen/DLConvertor.cpp index 2d16299c780d..8dda0c3acfbc 100644 --- a/aten/src/ATen/DLConvertor.cpp +++ b/aten/src/ATen/DLConvertor.cpp @@ -71,6 +71,12 @@ DLDataType getDLDataType(const Tensor& t) { case ScalarType::Float8_e8m0fnu: TORCH_CHECK(false, "float8 types are not supported by dlpack"); break; +<<<<<<< HEAD +======= + case ScalarType::Float4_e2m1fn_x2: + TORCH_CHECK(false, "float4 types are not supported by dlpack"); + break; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case ScalarType::QInt8: case ScalarType::QUInt8: case ScalarType::QInt32: diff --git a/aten/src/ATen/DeviceAccelerator.cpp b/aten/src/ATen/DeviceAccelerator.cpp index 7efa561e1801..4e4927cd9a06 100644 --- a/aten/src/ATen/DeviceAccelerator.cpp +++ b/aten/src/ATen/DeviceAccelerator.cpp @@ -76,7 +76,11 @@ c10::DeviceIndex deviceCount() { return static_cast(0); } c10::impl::VirtualGuardImpl impl(device_type.value()); +<<<<<<< HEAD return static_cast(impl.deviceCount()); +======= + return impl.deviceCount(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void setDeviceIndex(c10::DeviceIndex device_index) { @@ -88,7 +92,11 @@ void setDeviceIndex(c10::DeviceIndex device_index) { c10::DeviceIndex getDeviceIndex() { const auto device_type = getAccelerator(true).value(); c10::impl::VirtualGuardImpl impl(device_type); +<<<<<<< HEAD return static_cast(impl.getDevice().index()); +======= + return impl.getDevice().index(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void setCurrentStream(c10::Stream stream) { @@ -115,6 +123,24 @@ void synchronizeDevice(c10::DeviceIndex device_index) { // impl.synchronizeDevice should can be safely called from any device impl.synchronizeDevice(device_index); } +<<<<<<< HEAD +======= + +c10::DeviceIndex exchangeDevice(c10::DeviceIndex device_index) { + const auto device_type = getAccelerator(true).value(); + c10::impl::VirtualGuardImpl impl(device_type); + return impl.exchangeDevice({device_type, device_index}).index(); +} + +c10::DeviceIndex maybeExchangeDevice(c10::DeviceIndex device_index) { + const auto device_type = getAccelerator(true).value(); + c10::impl::VirtualGuardImpl impl(device_type); + // Avoid creating a new context if the context for the given device_index + // is not initialized. + impl.uncheckedSetDevice({device_type, device_index}); + return impl.getDevice().index(); +} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NOLINTEND(bugprone-unchecked-optional-access) } // namespace at::accelerator diff --git a/aten/src/ATen/DeviceAccelerator.h b/aten/src/ATen/DeviceAccelerator.h index 60e74a90d604..6b57ce8e2d39 100644 --- a/aten/src/ATen/DeviceAccelerator.h +++ b/aten/src/ATen/DeviceAccelerator.h @@ -26,6 +26,7 @@ TORCH_API std::optional getAccelerator(bool checked = false); // Check if the given device type is an accelerator. TORCH_API bool isAccelerator(c10::DeviceType device_type); +<<<<<<< HEAD // Check if the given device type is an accelerator, not an excluded one. 
TORCH_API inline bool isAcceleratorExcluded( c10::DeviceType device_type, @@ -33,6 +34,8 @@ TORCH_API inline bool isAcceleratorExcluded( return device_type != excluded && isAccelerator(device_type); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Check if the given device type is an accelerator, not the excluded ones. template < typename... T, @@ -41,8 +44,17 @@ TORCH_API inline bool isAcceleratorExcluded( c10::DeviceType device_type, c10::DeviceType first_excluded, T... rest_excluded) { +<<<<<<< HEAD return device_type != first_excluded && isAcceleratorExcluded(device_type, rest_excluded...); +======= + if constexpr (sizeof...(rest_excluded) > 0) { + return device_type != first_excluded && + isAcceleratorExcluded(device_type, rest_excluded...); + } else { + return device_type != first_excluded && isAccelerator(device_type); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // Return the number of the device available. Note that this is *REQUIRED* to @@ -66,6 +78,18 @@ TORCH_API c10::Stream getCurrentStream(c10::DeviceIndex device_index); // on the given device index has been completed. TORCH_API void synchronizeDevice(c10::DeviceIndex device_index); +<<<<<<< HEAD +======= +// Set the current device index to the given device_index and return the +// original device index that was active before the change. +TORCH_API c10::DeviceIndex exchangeDevice(c10::DeviceIndex device_index); + +// Set the current device index to the given device_index. Avoid creating a new +// context if the context for device_index is not initialized. Return the +// original device index that was active before the change. 
+TORCH_API c10::DeviceIndex maybeExchangeDevice(c10::DeviceIndex device_index); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::accelerator namespace at { diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h index 5c7b39c6427a..6bdc3d9e2f5e 100644 --- a/aten/src/ATen/Dispatch.h +++ b/aten/src/ATen/Dispatch.h @@ -200,7 +200,11 @@ inline at::ScalarType scalar_type(at::ScalarType s) { switch (_st) { \ __VA_ARGS__ \ default: \ +<<<<<<< HEAD TORCH_CHECK( \ +======= + TORCH_CHECK_NOT_IMPLEMENTED( \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) false, \ '"', \ at_dispatch_name, \ diff --git a/aten/src/ATen/EmptyTensor.cpp b/aten/src/ATen/EmptyTensor.cpp index 5361d6b2d0c3..9199936eee87 100644 --- a/aten/src/ATen/EmptyTensor.cpp +++ b/aten/src/ATen/EmptyTensor.cpp @@ -28,8 +28,12 @@ c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) { opt_device_type = at::getAccelerator(false); } if (opt_device_type.has_value()) { +<<<<<<< HEAD return at::globalContext().getPinnedMemoryAllocator( opt_device_type.value()); +======= + return at::globalContext().getPinnedMemoryAllocator(opt_device_type); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { TORCH_CHECK( false, "Need to provide pin_memory allocator to use pin memory.") @@ -160,17 +164,33 @@ SymInt computeStorageNbytes( // of the last element according to stride SymInt size = 1; for (const auto i : c10::irange(sizes.size())) { +<<<<<<< HEAD if (TORCH_GUARD_SIZE_OBLIVIOUS(sizes[i].sym_eq(0))) { return 0; } +======= + if (TORCH_GUARD_OR_FALSE(sizes[i].sym_eq(0))) { + return 0; + } + + // NOTE: while this can technically return negative sizes for + // 0-element tensors, there's a check in TensorShape:set_storage_meta__symint + // that skips setting nbytes with unbacked expressions. + // Would probably be safer to wrap this with a max(*, 0), + // once our min/max symbolic reasoning improves. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) size += strides[i] * (sizes[i] - 1); } return itemsize_bytes * (storage_offset + size); } template +<<<<<<< HEAD TensorBase _empty_generic( +======= +static TensorBase _empty_generic( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ArrayRef size, c10::Allocator* allocator, c10::DispatchKeySet ks, @@ -223,7 +243,11 @@ TensorBase empty_generic_symint( } template +<<<<<<< HEAD TensorBase _empty_strided_generic( +======= +static TensorBase _empty_strided_generic( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) T size, T stride, c10::Allocator* allocator, diff --git a/aten/src/ATen/ExpandUtils.cpp b/aten/src/ATen/ExpandUtils.cpp index 5f6be741ce01..a2354b3d88ae 100644 --- a/aten/src/ATen/ExpandUtils.cpp +++ b/aten/src/ATen/ExpandUtils.cpp @@ -59,7 +59,11 @@ SymDimVector infer_size_symdimvector(SymIntArrayRef a, SymIntArrayRef b) { } template +<<<<<<< HEAD C10_ALWAYS_INLINE InferExpandGeometryResult inferExpandGeometryImpl( +======= +C10_ALWAYS_INLINE static InferExpandGeometryResult inferExpandGeometryImpl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) IntArrayRef tensor_sizes, IntArrayRef tensor_strides, IntArrayRef sizes) { diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h index e9abc85b59c3..c2348306b2f7 100644 --- a/aten/src/ATen/ExpandUtils.h +++ b/aten/src/ATen/ExpandUtils.h @@ -461,9 +461,23 @@ inline Tensor _sum_to( reduce_dims.push_back(i); } for (int64_t i = leading_dims; i < static_cast(sizes.size()); ++i) { +<<<<<<< HEAD if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_eq(shape[i - leading_dims], 1)) && TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(sizes[i], 1))) { reduce_dims.push_back(i); +======= + if (TORCH_GUARD_OR_FALSE(sym_eq(shape[i - leading_dims], 1)) && + TORCH_GUARD_OR_TRUE(sym_ne(sizes[i], 1))) { + reduce_dims.push_back(i); + } else { + // if we assume no reduction due to unbacked we ensure that at runtime. 
+ TORCH_MAYBE_SYM_CHECK( + sym_eq(shape[i - leading_dims], sizes[i]), + "non-reduction path was assumed due to unbacked symbols; expected those two sizes to be the same:", + shape[i - leading_dims], + ", ", + sizes[i]) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } diff --git a/aten/src/ATen/FunctionalTensorWrapper.cpp b/aten/src/ATen/FunctionalTensorWrapper.cpp index 409f944a88e3..97316bfd6b68 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.cpp +++ b/aten/src/ATen/FunctionalTensorWrapper.cpp @@ -737,7 +737,11 @@ bool isFunctionalTensor(const c10::List<::std::optional>& t_list) { } template +<<<<<<< HEAD bool isFunctionalTensorIListRef(c10::IListRef list) { +======= +static bool isFunctionalTensorIListRef(c10::IListRef list) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (list.size() == 0) return false; auto functional_count = 0; for (const auto& tensor : list) { @@ -803,7 +807,11 @@ void set_sizes_strides_offset(const std::vector& outs, const std::vector } } +<<<<<<< HEAD thread_local bool _functionalizationReapplyViews; +======= +thread_local static bool _functionalizationReapplyViews; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool getFunctionalizationReapplyViewsTLS() { return _functionalizationReapplyViews; diff --git a/aten/src/ATen/FunctionalizeFallbackKernel.cpp b/aten/src/ATen/FunctionalizeFallbackKernel.cpp index 36b6f91c1d99..63d26e7042d6 100644 --- a/aten/src/ATen/FunctionalizeFallbackKernel.cpp +++ b/aten/src/ATen/FunctionalizeFallbackKernel.cpp @@ -7,6 +7,10 @@ #include #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef AT_PER_OPERATOR_HEADERS #include @@ -315,8 +319,38 @@ static at::Tensor _unsafe_view_functionalize(const at::Tensor & self, at::SymInt // See Note [Propagating strides in the functionalization pass] // (for _unsafe_view, I'm just manually doing the shape inference rule here instead of calling the meta function for unsafe_view) auto inferred_size = at::infer_size_dv(size, self.sym_numel()); +<<<<<<< HEAD auto stride = at::detail::computeStride(self.sym_sizes(), self.sym_strides(), inferred_size); TORCH_INTERNAL_ASSERT(stride.has_value()); +======= + + auto stride = at::detail::computeStride(self.sym_sizes(), self.sym_strides(), inferred_size); + + if (!stride.has_value()) { + // With unbacked symints, computeStride could fail even on contiguous + // tensors. In this case, we can use the strides of an empty tensor of + // inferred_size.
+ TORCH_CHECK( + self.is_contiguous(), + "View is not valid from size:", + self.sym_sizes(), + " stride: ", + self.sym_strides(), + " to shape: ", + inferred_size, + " in case of unbacked symbols consider adding torch.check to guide computing strides."); + + stride = at::detail::empty_symint_meta( + inferred_size, + std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt) + .sym_strides(); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) out.unsafeGetTensorImpl()->set_sizes_and_strides(inferred_size, stride.value()); return out; } diff --git a/aten/src/ATen/InferSize.h b/aten/src/ATen/InferSize.h index 3bcccfad971c..b7adde7ca6fd 100644 --- a/aten/src/ATen/InferSize.h +++ b/aten/src/ATen/InferSize.h @@ -25,11 +25,16 @@ inline void infer_size_impl( // N.B. this is an index, not a sym dim! std::optional infer_dim; for (int64_t dim = 0, ndim = shape.size(); dim != ndim; dim++) { +<<<<<<< HEAD if (shape[dim] == -1) { +======= + if (TORCH_GUARD_OR_FALSE(sym_eq(shape[dim], -1))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (infer_dim) { throw std::runtime_error("only one dimension can be inferred"); } infer_dim = dim; +<<<<<<< HEAD } else if (shape[dim] >= 0) { newsize *= shape[dim]; } else { @@ -62,6 +67,56 @@ inline void infer_size_impl( std::ostringstream ss; ss << "shape '" << shape << "' is invalid for input of size " << numel; throw std::runtime_error(ss.str()); +======= + } else { + // in case of unbacked shape[dim] we assume it's not -1 and add a runtime + // assertion. + TORCH_MAYBE_SYM_CHECK( + sym_gt(shape[dim], -1), + "invalid shape dimension ", + shape[dim], + " at index ", + dim, + " of shape ", + shape); + newsize *= shape[dim]; + } + } + + auto set_infer_dim = [&]() { + // We have a degree of freedom here to select the dimension size; follow + // NumPy semantics and just bail. However, a nice error message is needed + // because users often use `view` as a way to flatten & unflatten + // dimensions and will otherwise be confused why + // empty_tensor.view( 0, 0) + // works yet + // empty_tensor.view(-1, 0) + // doesn't. 
+ TORCH_CHECK( + newsize != 0, + "cannot reshape tensor of 0 elements into shape ", + shape, + " because the unspecified dimension size -1 can be any " + "value and is ambiguous"); + res[*infer_dim] = numel / newsize; + return; + }; + + if (infer_dim && newsize > 0 && numel % newsize == 0) { + set_infer_dim(); + return; + } + + TORCH_MAYBE_SYM_CHECK( + sym_eq(numel, newsize), + "shape '", + shape, + "' is invalid for input of size ", + numel); + if (infer_dim) { + set_infer_dim(); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline std::vector infer_size(IntArrayRef shape, int64_t numel) { diff --git a/aten/src/ATen/LegacyVmapMode.cpp b/aten/src/ATen/LegacyVmapMode.cpp index 731f8cbafd82..7e5eab1cb93a 100644 --- a/aten/src/ATen/LegacyVmapMode.cpp +++ b/aten/src/ATen/LegacyVmapMode.cpp @@ -2,7 +2,11 @@ namespace at::impl { +<<<<<<< HEAD thread_local int64_t VmapMode_current_vmap_level = 0; +======= +thread_local static int64_t VmapMode_current_vmap_level = 0; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t VmapMode::current_vmap_level() { return VmapMode_current_vmap_level; diff --git a/aten/src/ATen/MemoryOverlap.cpp b/aten/src/ATen/MemoryOverlap.cpp index 61336037d71b..15e59033f2da 100644 --- a/aten/src/ATen/MemoryOverlap.cpp +++ b/aten/src/ATen/MemoryOverlap.cpp @@ -35,7 +35,11 @@ MemOverlap has_internal_overlap(TensorImpl* t) { // SymInts. Thus, if I have u0 size, we should assume that this has > 1 // elements (first expression), but if I have a u0 stride, I should NOT // assume that it is not zero (second expression) +<<<<<<< HEAD if (TORCH_GUARD_SIZE_OBLIVIOUS(sizes[i].sym_gt(1)) && strides[i] == 0) { +======= + if (TORCH_GUARD_OR_FALSE(sizes[i].sym_gt(1)) && strides[i] == 0) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return MemOverlap::Yes; } } diff --git a/aten/src/ATen/NestedTensorImpl.cpp b/aten/src/ATen/NestedTensorImpl.cpp index b64ac79bc9f5..93b16ed17f08 100644 --- a/aten/src/ATen/NestedTensorImpl.cpp +++ b/aten/src/ATen/NestedTensorImpl.cpp @@ -71,7 +71,11 @@ c10::DispatchKeySet get_view_key_set(const at::Tensor& base) { namespace at::native { +<<<<<<< HEAD inline std::vector construct_opt_sizes(const at::Tensor& sizes) { +======= +inline static std::vector construct_opt_sizes(const at::Tensor& sizes) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // torch.tensor([]) is considered to have `dim() = 1` and `size(0) = 0` // torch.nested_tensor([]) should also has `dim() = 1` and `size(0) = 0` if (sizes.dim() == 0) { @@ -182,7 +186,11 @@ NestedTensorImpl::NestedTensorImpl( "coverage, and works with torch.compile."); auto storage_device = storage_.device(); TORCH_INTERNAL_ASSERT( +<<<<<<< HEAD storage_device.is_cpu() || storage_device.is_cuda() || storage_device.is_xpu() || storage_device.is_privateuseone(), +======= + storage_device.is_cpu() || storage_device.is_cuda() || storage_device.is_xpu() || storage_device.is_hpu() || storage_device.is_privateuseone(), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "NestedTensorImpl storage must be either CUDA, CPU, XPU or ", get_privateuse1_backend(), " but got ", storage_device); 
validate_nested_tensor_metadata(nested_sizes_, nested_strides_, storage_offsets_); diff --git a/aten/src/ATen/OpMathType.h b/aten/src/ATen/OpMathType.h index d00195b07e49..0cdb18eea703 100644 --- a/aten/src/ATen/OpMathType.h +++ b/aten/src/ATen/OpMathType.h @@ -41,6 +41,13 @@ struct OpMathType { using type = float; }; template <> +<<<<<<< HEAD +======= +struct OpMathType { + using type = float; +}; +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) struct OpMathType> { using type = c10::complex; }; diff --git a/aten/src/ATen/OpaqueTensorImpl.h b/aten/src/ATen/OpaqueTensorImpl.h index f9f69aa3c42b..254efbdd543c 100644 --- a/aten/src/ATen/OpaqueTensorImpl.h +++ b/aten/src/ATen/OpaqueTensorImpl.h @@ -29,12 +29,29 @@ struct TORCH_API OpaqueTensorImpl : public TensorImpl { bool is_non_overlapping_and_dense = true) : TensorImpl(key_set, data_type, device), opaque_handle_(std::move(opaque_handle)) { +<<<<<<< HEAD set_storage_access_should_throw(); set_custom_sizes_strides(SizesStridesPolicy::CustomStrides); sizes_and_strides_.set_sizes(sizes); refresh_numel(); // NOLINTNEXTLINE(cppcoreguidelines-prefer-member-initializer) is_non_overlapping_and_dense_ = is_non_overlapping_and_dense; +======= + constructor_impl(sizes, is_non_overlapping_and_dense); + } + + OpaqueTensorImpl( + TensorImpl::ImplType impl_type, + c10::Storage&& storage, + at::DispatchKeySet key_set, + const caffe2::TypeMeta data_type, + OpaqueHandle opaque_handle, + c10::IntArrayRef sizes, + bool is_non_overlapping_and_dense = true) + : TensorImpl(impl_type, std::move(storage), key_set, data_type), + opaque_handle_(std::move(opaque_handle)) { + constructor_impl(sizes, is_non_overlapping_and_dense); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // Destructor doesn't call release_resources because it's @@ -181,6 +198,20 @@ struct TORCH_API OpaqueTensorImpl : public TensorImpl { return "OpaqueTensorImpl"; } +<<<<<<< HEAD +======= + void constructor_impl( + c10::IntArrayRef sizes, + bool is_non_overlapping_and_dense) { + set_storage_access_should_throw(); + set_custom_sizes_strides(SizesStridesPolicy::CustomStrides); + sizes_and_strides_.set_sizes(sizes); + refresh_numel(); + // NOLINTNEXTLINE(cppcoreguidelines-prefer-member-initializer) + is_non_overlapping_and_dense_ = is_non_overlapping_and_dense; + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) OpaqueHandle opaque_handle_; }; diff --git a/aten/src/ATen/ParallelNative.cpp b/aten/src/ATen/ParallelNative.cpp index 699c47e36725..db6867681e80 100644 --- a/aten/src/ATen/ParallelNative.cpp +++ b/aten/src/ATen/ParallelNative.cpp @@ -222,8 +222,12 @@ void set_num_threads(int nthreads) { int stored_nthreads = num_intraop_threads.load(); if (stored_nthreads <= 0) { // plus one because of master thread +<<<<<<< HEAD // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) stored_nthreads = _get_intraop_pool().size() + 1; +======= + stored_nthreads = static_cast(_get_intraop_pool().size() + 1); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } if (stored_nthreads != nthreads) { TORCH_WARN( @@ -251,8 +255,12 @@ int get_num_threads() { return intraop_default_num_threads(); } else { TORCH_INTERNAL_ASSERT(nthreads == CONSUMED); +<<<<<<< 
HEAD // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) return _get_intraop_pool().size() + 1; +======= + return static_cast(_get_intraop_pool().size() + 1); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #else caffe2::PThreadPool* const pool = caffe2::pthreadpool(); diff --git a/aten/src/ATen/ParallelOpenMP.cpp b/aten/src/ATen/ParallelOpenMP.cpp index 388cbb1a4b9f..2e09fba48f69 100644 --- a/aten/src/ATen/ParallelOpenMP.cpp +++ b/aten/src/ATen/ParallelOpenMP.cpp @@ -10,6 +10,7 @@ #include #endif +<<<<<<< HEAD #include namespace at { @@ -19,6 +20,15 @@ namespace native::mkldnn { void clear_computation_cache(); } // namespace native::mkldnn #endif +======= +#if AT_MKLDNN_ENABLED() +#include +#endif + +#include + +namespace at { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace { // Number of threads set by the user diff --git a/aten/src/ATen/SavedTensorHooks.cpp b/aten/src/ATen/SavedTensorHooks.cpp index 0313849f788f..b9a3603cb663 100644 --- a/aten/src/ATen/SavedTensorHooks.cpp +++ b/aten/src/ATen/SavedTensorHooks.cpp @@ -26,9 +26,15 @@ bool SavedTensorDefaultHooks::is_enabled() { return !tls.disabled_error_message.has_value(); } +<<<<<<< HEAD void SavedTensorDefaultHooks::disable(const std::string& message) { tls.disabled_error_message = message; if (!tls.stack.empty()) { +======= +void SavedTensorDefaultHooks::disable(const std::string& message, const bool fail_if_non_empty) { + tls.disabled_error_message = message; + if (fail_if_non_empty && !tls.stack.empty()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) assertSavedTensorHooksNotDisabled(); } } @@ -72,9 +78,15 @@ std::pair SavedTensorDefaultHooks::pop_hooks() { return hooks; } +<<<<<<< HEAD std::optional> SavedTensorDefaultHooks::get_hooks() { // For tls.is_tracing, see NOTE: [Deferring tensor pack/unpack hooks until runtime] if (!is_initialized || tls.stack.empty() || tls.is_tracing) { +======= +std::optional> SavedTensorDefaultHooks::get_hooks(bool ignore_is_tracing) { + // For tls.is_tracing, see NOTE: [Deferring tensor pack/unpack hooks until runtime] + if (!is_initialized || tls.stack.empty() || (!ignore_is_tracing && tls.is_tracing)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return std::nullopt; } return tls.stack.top(); diff --git a/aten/src/ATen/SavedTensorHooks.h b/aten/src/ATen/SavedTensorHooks.h index 2803bdc64668..3dfc5a535a06 100644 --- a/aten/src/ATen/SavedTensorHooks.h +++ b/aten/src/ATen/SavedTensorHooks.h @@ -36,7 +36,11 @@ struct TORCH_API SavedTensorDefaultHooks { c10::SafePyObject unpack_hook); static std::pair pop_hooks(); static std::optional> +<<<<<<< HEAD get_hooks(); +======= + get_hooks(bool ignore_is_tracing = false); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static void lazy_initialize(); static const impl::SavedTensorDefaultHooksTLS& get_tls_state(); @@ -48,7 +52,13 @@ struct TORCH_API SavedTensorDefaultHooks { // disabled, then the following will raise an error: // - Attempting to push_hooks // - calling disable(message) with a non-zero stack (hooks) size +<<<<<<< HEAD static void disable(const std::string& error_message); +======= + static void 
disable( + const std::string& error_message, + const bool fail_if_non_empty = true); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static void enable(); static bool is_enabled(); static const std::optional& get_disabled_error_message(); diff --git a/aten/src/ATen/ScalarOps.cpp b/aten/src/ATen/ScalarOps.cpp index 693fb46e639f..c5a4447b932a 100644 --- a/aten/src/ATen/ScalarOps.cpp +++ b/aten/src/ATen/ScalarOps.cpp @@ -8,7 +8,32 @@ namespace at { namespace { template inline void fill_inplace(Tensor& self, const Scalar& value_scalar) { +<<<<<<< HEAD auto value = value_scalar.to(); +======= + scalar_t value{}; + + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + // relaxed float cast: allow inf similar to the torch.tensor constructor + // + // without this, we had the following divergence: + // torch.tensor(1123581321.0, dtype=torch.float16) + // => tensor(inf, dtype=torch.float16) + // torch.ops.aten.scalar_tensor.default(1123581321, dtype=torch.float16) + // => RuntimeError: value cannot be converted to type at::Half without overflow + + value = static_cast(value_scalar.to()); + } else { + value = value_scalar.to(); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t* dptr = static_cast(self.data_ptr()); *dptr = value; } diff --git a/aten/src/ATen/SparseTensorImpl.cpp b/aten/src/ATen/SparseTensorImpl.cpp index 2a3b9481255f..8a64d62040c2 100644 --- a/aten/src/ATen/SparseTensorImpl.cpp +++ b/aten/src/ATen/SparseTensorImpl.cpp @@ -108,7 +108,11 @@ void SparseTensorImpl::set_indices_and_values_unsafe(const Tensor& indices, cons AT_ASSERT(device() == values_.device()); AT_ASSERT(values_.device() == indices_.device()); +<<<<<<< HEAD coalesced_ = TORCH_GUARD_SIZE_OBLIVIOUS(sym_nnz().sym_lt(2)); +======= + coalesced_ = TORCH_GUARD_OR_FALSE(sym_nnz().sym_lt(2)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } diff --git a/aten/src/ATen/TensorGeometry.cpp b/aten/src/ATen/TensorGeometry.cpp index cc6fc929f533..8dd30590eb26 100644 --- a/aten/src/ATen/TensorGeometry.cpp +++ b/aten/src/ATen/TensorGeometry.cpp @@ -5,7 +5,11 @@ namespace at { // See TensorGeometry.h on why this is useful now that we cache is_contiguous. template +<<<<<<< HEAD bool _geometry_is_contiguous(ArrayRef sizes, ArrayRef strides) { +======= +static bool _geometry_is_contiguous(ArrayRef sizes, ArrayRef strides) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) assert(!overflows(sizes.size())); auto dim = static_cast(sizes.size()); T expected_stride = 1; diff --git a/aten/src/ATen/TensorIndexing.h b/aten/src/ATen/TensorIndexing.h index 38fe78901ce7..a090164a8bef 100644 --- a/aten/src/ATen/TensorIndexing.h +++ b/aten/src/ATen/TensorIndexing.h @@ -222,8 +222,13 @@ inline Tensor applySlice( ? 
(*self_sizes)[dim] : self.sym_size(dim); if (!disable_slice_optimization && +<<<<<<< HEAD TORCH_GUARD_SIZE_OBLIVIOUS(start.sym_eq(0)) && TORCH_GUARD_SIZE_OBLIVIOUS(length.sym_eq(stop)) && step == 1) { +======= + TORCH_STATICALLY_KNOWN_TRUE(start.sym_eq(0)) && + TORCH_STATICALLY_KNOWN_TRUE(length.sym_eq(stop)) && step == 1) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return self; } } diff --git a/aten/src/ATen/TensorIterator.cpp b/aten/src/ATen/TensorIterator.cpp index 805f1f2f6c2e..7a6d5f8e30db 100644 --- a/aten/src/ATen/TensorIterator.cpp +++ b/aten/src/ATen/TensorIterator.cpp @@ -1388,7 +1388,11 @@ bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) { case FastSetupType::NON_OVERLAPPING_DENSE: { // find the index of a defined tensor in operands_ start from input tensor +<<<<<<< HEAD int i_defined; // NOLINT(cppcoreguidelines-init-variables) +======= + int i_defined = -1; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (i_defined = ntensors() - 1; i_defined >= 0; --i_defined) { if (tensor(i_defined).defined()) break; } @@ -1535,7 +1539,10 @@ void TensorIteratorBase::build(TensorIteratorConfig& config) { // Nothing beyond this point is important for meta functions, so it's fine to exit early here. // Extend the condition to MAIA tesnors as MAIA tensors also don't have storage. if (privateuse1_without_storage || +<<<<<<< HEAD common_device_.type() == DeviceType::MTIA || +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) common_device_.type() == DeviceType::XLA || common_device_.type() == DeviceType::IPU || common_device_.type() == DeviceType::Lazy || diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp index 399164688f86..0e23a5b9824d 100644 --- a/aten/src/ATen/TensorUtils.cpp +++ b/aten/src/ATen/TensorUtils.cpp @@ -327,7 +327,11 @@ std::vector defaultStrides(IntArrayRef sizes) { // see overloads of computeStride() below. // template +<<<<<<< HEAD inline std::optional computeStride_impl( +======= +inline static std::optional computeStride_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const NewShapeVec& oldshape, const NewShapeVec& oldstride, const NewShapeVec& newshape, @@ -343,7 +347,11 @@ inline std::optional computeStride_impl( // This could perhaps be combined with the below code, but the complexity // didn't seem worth it. const Numel numel = c10::multiply_integers(oldshape); +<<<<<<< HEAD bool zero_numel = TORCH_GUARD_SIZE_OBLIVIOUS(sym_eq(numel, 0)); +======= + bool zero_numel = TORCH_GUARD_OR_FALSE(sym_eq(numel, 0)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (zero_numel && oldshape.equals(newshape)) { return toResult(oldstride); } @@ -367,19 +375,47 @@ inline std::optional computeStride_impl( // numel in current chunk Numel tensor_numel = 1; Numel view_numel = 1; +<<<<<<< HEAD +======= + + // The usages of TORCH_GUARD_OR_TRUE/TORCH_GUARD_OR_FALSE below could result in returning + // std::nullopt which has an effect of falling back to a clone when unbacked symints are present. + // But it will not result in returning different or wrong results. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (int64_t tensor_d = oldshape.size() - 1; tensor_d >= 0; tensor_d--) { tensor_numel *= oldshape[tensor_d]; // if end of tensor size chunk, check view if ((tensor_d == 0) || +<<<<<<< HEAD (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(oldshape[tensor_d - 1], 1)) && TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(oldstride[tensor_d - 1], tensor_numel * chunk_base_stride)))) { while (view_d >= 0 && (TORCH_GUARD_SIZE_OBLIVIOUS(sym_lt(view_numel, tensor_numel)) || TORCH_GUARD_SIZE_OBLIVIOUS(sym_eq(newshape[view_d], 1)))) { +======= + (TORCH_GUARD_OR_TRUE(sym_ne(oldshape[tensor_d - 1], 1)) && + TORCH_GUARD_OR_TRUE(sym_ne(oldstride[tensor_d - 1], tensor_numel * chunk_base_stride)))) { + // We want to accumulate stuff in view_numel until view_numel == tensor_numel; if we do not + // know if that is satisfied we keep accumulating. For example if view_numel = 1 and tensor_numel = u1, + // we want to take that path, view_numel will become u0. Next iteration if u0==u1 we want to stop. + // That's why we use TORCH_GUARD_OR_TRUE below. + + // we use TORCH_GUARD_OR_FALSE and not TORCH_GUARD_OR_TRUE when comparing newshape[view_d] ==1 because + // if we know view_numel < tensor_numel is false, we want to stop. Unless we know for sure newshape[view_d]==1 + // in that case we would stop in the next iteration anyway. For example, if view_numel = u0 and tensor_numel = u1, + // and u0==u1, then we want to stop unless newshape[view_d]==1. Taking one more iteration will keep [view_numel = u0 + // and tensor_numel = u1]. + while (view_d >= 0 && + (TORCH_GUARD_OR_TRUE(sym_lt(view_numel, tensor_numel)) || TORCH_GUARD_OR_FALSE(sym_eq(newshape[view_d], 1)))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) newstride[view_d] = view_numel * chunk_base_stride; view_numel *= newshape[view_d]; view_d--; } +<<<<<<< HEAD if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(view_numel, tensor_numel))) { +======= + if (TORCH_GUARD_OR_TRUE(sym_ne(view_numel, tensor_numel))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return std::nullopt; } if (tensor_d > 0) { diff --git a/aten/src/ATen/Version.cpp b/aten/src/ATen/Version.cpp index 51d5f2d6412f..b01b4697c281 100644 --- a/aten/src/ATen/Version.cpp +++ b/aten/src/ATen/Version.cpp @@ -105,7 +105,11 @@ std::string get_cpu_capability() { return "DEFAULT"; case native::CPUCapability::ZVECTOR: return "Z VECTOR"; +<<<<<<< HEAD #elif defined(HAVE_SVE_CPU_DEFINITION) +======= +#elif defined(HAVE_SVE256_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case native::CPUCapability::DEFAULT: return "DEFAULT"; case native::CPUCapability::SVE256: diff --git a/aten/src/ATen/VmapModeRegistrations.cpp b/aten/src/ATen/VmapModeRegistrations.cpp index 5e137c7396ba..538c802b19cf 100644 --- a/aten/src/ATen/VmapModeRegistrations.cpp +++ b/aten/src/ATen/VmapModeRegistrations.cpp @@ -20,12 +20,20 @@ namespace at { // We haven't made a decision on that yet so we are temporarily banning random // operations inside of vmap while we gather user feedback. +<<<<<<< HEAD template Tensor unsupportedRandomOp(Args... args) { +======= +template static Tensor unsupportedRandomOp(Args...
args) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(false, "vmap: We do not yet support calling random operations inside of vmap. ", "Please perform random operations outside of vmap as a workaround"); } +<<<<<<< HEAD template Tensor& unsupportedRandomOp_(Args... args) { +======= +template static Tensor& unsupportedRandomOp_(Args... args) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(false, "vmap: We do not yet support calling random operations inside of vmap. ", "Please perform random operations outside of vmap as a workaround"); } diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index 4fae147e2815..f23cdd51342e 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -64,7 +64,11 @@ thread_local std::array at::ScalarType::Undefined, // IDEEP. at::kHalf, // AMD HIP at::ScalarType::Undefined, // FPGA +<<<<<<< HEAD at::ScalarType::Undefined, // ONNX Runtime / Microsoft +======= + at::kBFloat16, // ONNX Runtime / Microsoft +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::kBFloat16, // XLA / TPU at::ScalarType::Undefined, // Vulkan at::ScalarType::Undefined, // Metal @@ -500,6 +504,47 @@ TORCH_LIBRARY_IMPL(aten, AutocastMTIA, m) { TORCH_FN((&at::autocast::binary_cross_entropy_banned))); } +<<<<<<< HEAD +======= +// MAIA +TORCH_LIBRARY_IMPL(_, AutocastMAIA, m) { + m.fallback(torch::CppFunction::makeFallthrough()); +} + +TORCH_LIBRARY_IMPL(aten, AutocastMAIA, m) { + // lower_precision_fp +#define _KERNEL_MAIA_LOW_PRECISION_FP(...) \ + KERNEL_MAIA(__VA_ARGS__, lower_precision_fp) + + AT_FORALL_LOWER_PRECISION_FP(_KERNEL_MAIA_LOW_PRECISION_FP) + + // fp32 +#define _KERNEL_MAIA_FP32(...) KERNEL_MAIA(__VA_ARGS__, fp32) + + AT_FORALL_FP32(_KERNEL_MAIA_FP32) + + // fp32_set_opt_dtype +#define _KERNEL_MAIA_FP32_SET_OPT_DTYPE(...) \ + KERNEL_MAIA(__VA_ARGS__, fp32_set_opt_dtype) + + AT_FORALL_FP32_SET_OPT_DTYPE(_KERNEL_MAIA_FP32_SET_OPT_DTYPE) + + // fp32_append_dtype + // The fp32_append_dtype wrapper overrides implicit promotion behavior. + // norm does not implicitly promote, but be aware when adding new ops to this policy. + AT_FORALL_DIFFERENT_REDISPATCH_SIGNATURE( + KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_MAIA) + + // promote +#define _KERNEL_MAIA_PROMOTE(...) 
KERNEL_MAIA(__VA_ARGS__, promote) + + AT_FORALL_PROMOTE(_KERNEL_MAIA_PROMOTE) + + m.impl(TORCH_SELECTIVE_NAME("aten::binary_cross_entropy"), + TORCH_FN((&at::autocast::binary_cross_entropy_banned))); +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // XPU TORCH_LIBRARY_IMPL(_, AutocastXPU, m) { m.fallback(torch::CppFunction::makeFallthrough()); diff --git a/aten/src/ATen/autocast_mode.h b/aten/src/ATen/autocast_mode.h index ec30eb66834a..6b7b575b3c19 100644 --- a/aten/src/ATen/autocast_mode.h +++ b/aten/src/ATen/autocast_mode.h @@ -123,12 +123,23 @@ TORCH_API inline void set_autocast_gpu_dtype(at::ScalarType dtype) { _(privateuseone, at::kPrivateUse1) // deprecated other backend specific autocast APIs +<<<<<<< HEAD AT_FORALL_DEPRECATED_AUTOCAST_BACKENDS(DECLARE_DEPRECATED_AUTOCAST_APIS) const std::array _AUTOCAST_SUPPORTED_DEVICES{ at::kCPU, at::kCUDA, at::kMTIA, +======= +// NOLINTNEXTLINE(misc-use-internal-linkage) +AT_FORALL_DEPRECATED_AUTOCAST_BACKENDS(DECLARE_DEPRECATED_AUTOCAST_APIS) + +const std::array _AUTOCAST_SUPPORTED_DEVICES{ + at::kCPU, + at::kCUDA, + at::kMTIA, + at::kMAIA, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::kXPU, at::kIPU, at::kHPU, @@ -149,6 +160,11 @@ inline bool is_autocast_eligible( tensor.is_floating_point(); case c10::DeviceType::MTIA: return tensor.is_mtia() && tensor.is_floating_point(); +<<<<<<< HEAD +======= + case c10::DeviceType::MAIA: + return tensor.is_maia() && tensor.is_floating_point(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case c10::DeviceType::XPU: return tensor.is_xpu() && tensor.is_floating_point(); case c10::DeviceType::IPU: @@ -176,6 +192,11 @@ inline DispatchKey get_autocast_dispatch_key_from_device_type( return DispatchKey::AutocastCPU; case c10::DeviceType::MTIA: return DispatchKey::AutocastMTIA; +<<<<<<< HEAD +======= + case c10::DeviceType::MAIA: + return DispatchKey::AutocastMAIA; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case c10::DeviceType::XPU: return DispatchKey::AutocastXPU; case c10::DeviceType::IPU: @@ -189,7 +210,12 @@ inline DispatchKey get_autocast_dispatch_key_from_device_type( case c10::DeviceType::MPS: return DispatchKey::AutocastMPS; default: +<<<<<<< HEAD throw std::runtime_error( +======= + TORCH_CHECK( + false, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "unknown device type for autocast in get_autocast_dispatch_key_from_device_type"); } } @@ -210,7 +236,12 @@ inline at::ScalarType get_lower_precision_fp_from_device_type( if (is_autocast_available(device_type)) { return get_autocast_dtype(device_type); } else { +<<<<<<< HEAD throw std::runtime_error( +======= + TORCH_CHECK( + false, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "unknown device type for autocast in get_lower_precision_fp_from_device_type"); } } @@ -747,6 +778,27 @@ copy pasted in from VariableTypeEverything.cpp with appropriate substitutions. REDISPATCH_SIGNATURE, \ POLICY) +<<<<<<< HEAD +======= +// KERNEL_MAIA/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_MAIA +// registration (OP, POLICY) or (OP, OVERLOAD, POLICY) for AutocastMAIA +#define KERNEL_MAIA(...) 
KERNEL(c10::DeviceType::MAIA, __VA_ARGS__) + +#define KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_MAIA( \ + REDISPATCH_FUNC, \ + REGISTER_NAME, \ + REGISTER_SIGNATURE, \ + REDISPATCH_SIGNATURE, \ + POLICY) \ + KERNEL_DIFFERENT_REDISPATCH_SIGNATURE( \ + c10::DeviceType::MAIA, \ + REDISPATCH_FUNC, \ + REGISTER_NAME, \ + REGISTER_SIGNATURE, \ + REDISPATCH_SIGNATURE, \ + POLICY) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // KERNEL_XPU/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_XPU // registration (OP, POLICY) or (OP, OVERLOAD, POLICY) for AutocastXPU #define KERNEL_XPU(...) KERNEL(c10::DeviceType::XPU, __VA_ARGS__) diff --git a/aten/src/ATen/core/ATen_pch.h b/aten/src/ATen/core/ATen_pch.h index f10c191a4c1f..cd9b48cc0549 100644 --- a/aten/src/ATen/core/ATen_pch.h +++ b/aten/src/ATen/core/ATen_pch.h @@ -3,11 +3,14 @@ #pragma push_macro("TORCH_ASSERT_NO_OPERATORS") #define TORCH_ASSERT_NO_OPERATORS +<<<<<<< HEAD // This macro doesn't work if defined after the first time inttypes.h // is included, so won't work anywhere if not defined here. #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include // This list of headers was generated using a script that finds diff --git a/aten/src/ATen/core/CachingHostAllocator.cpp b/aten/src/ATen/core/CachingHostAllocator.cpp new file mode 100644 index 000000000000..5939253caf55 --- /dev/null +++ b/aten/src/ATen/core/CachingHostAllocator.cpp @@ -0,0 +1,33 @@ +#include + +#include + +namespace at { + +namespace { + +static std::array + allocator_array{}; +static std::array + allocator_priority{}; + +} // anonymous namespace + +void setHostAllocator( + at::DeviceType device_type, + at::HostAllocator* allocator, + uint8_t priority) { + if (priority >= allocator_priority[static_cast(device_type)]) { + allocator_array[static_cast(device_type)] = allocator; + allocator_priority[static_cast(device_type)] = priority; + } +} + +at::HostAllocator* getHostAllocator(at::DeviceType device_type) { + auto* allocator = allocator_array[static_cast(device_type)]; + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + allocator, "Host Allocator for ", device_type, " is not set."); + return allocator; +} + +} // namespace at diff --git a/aten/src/ATen/core/CachingHostAllocator.h b/aten/src/ATen/core/CachingHostAllocator.h index 76981dff46b8..414cbfbb551e 100644 --- a/aten/src/ATen/core/CachingHostAllocator.h +++ b/aten/src/ATen/core/CachingHostAllocator.h @@ -1,4 +1,11 @@ +<<<<<<< HEAD #include +======= +#pragma once + +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -46,7 +53,11 @@ namespace { } // Struct containing memory allocator summary statistics for host. +<<<<<<< HEAD struct HostStats { +======= +struct TORCH_API HostStats { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // COUNT: allocations requested by client code. 
Note that active // count can be extracted by looking at current allocations Stat allocation; @@ -174,7 +185,16 @@ template < typename E, typename B = HostBlock> struct CachingHostAllocatorImpl { +<<<<<<< HEAD virtual ~CachingHostAllocatorImpl() = default; +======= + virtual ~CachingHostAllocatorImpl() { + active_ = false; + if (pinned_use_background_threads()) { + getBackgroundThreadPool()->waitWorkComplete(); + } + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) public: // return data_ptr and block pair. @@ -211,7 +231,11 @@ struct CachingHostAllocatorImpl { // Launch the background thread and process events in a loop. static bool background_thread_flag [[maybe_unused]] = [this] { getBackgroundThreadPool()->run([&]() { +<<<<<<< HEAD while (true) { +======= + while (active_) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) process_events(); std::this_thread::sleep_for(std::chrono::microseconds(100)); } @@ -274,7 +298,12 @@ struct CachingHostAllocatorImpl { } } +<<<<<<< HEAD virtual bool record_event(void* ptr, void* ctx, S stream) { +======= + virtual bool record_event(void* ptr, void* ctx, c10::Stream s) { + S stream = S(s); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto* block = reinterpret_cast(ctx); // Note: we need to check if the passed-in `ctx` is valid. This is because @@ -467,7 +496,11 @@ struct CachingHostAllocatorImpl { virtual B* get_free_block(size_t size) { auto index = size_index(size); std::lock_guard g(free_list_[index].mutex_); +<<<<<<< HEAD if (free_list_[index].list_.size() > 0) { +======= + if (!free_list_[index].list_.empty()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) B* block = free_list_[index].list_.back(); free_list_[index].list_.pop_back(); block->allocated_ = true; @@ -616,28 +649,78 @@ struct CachingHostAllocatorImpl { alignas(64) std::mutex events_mutex_; std::deque> events_; // event queue paired with block +<<<<<<< HEAD +======= + + // Indicates whether the object is active. + // Set to false in the destructor to signal background threads to stop. + std::atomic active_{true}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) protected: alignas(64) HostStatsStaged stats_; }; +<<<<<<< HEAD template struct CachingHostAllocatorInterface : public at::Allocator { CachingHostAllocatorInterface() : impl_(std::make_unique()) {} at::DataPtr allocate(size_t size) override { TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for allocate"); +======= +struct TORCH_API HostAllocator : public at::Allocator { + // Associates the pinned memory allocation with a stream to track + // dependencies. 
This ensures the memory won't be reused until the stream's + // operations complete + virtual bool record_event(void* ptr, void* ctx, c10::Stream stream) = 0; + + // Frees all cached pinned memory and returns it to the system, clearing the + // allocator's internal cache + virtual void empty_cache() = 0; + + // Returns comprehensive statistics about the allocator's memory usage, + // allocation patterns, and timing metrics + virtual HostStats get_stats() = 0; + + // Resets the cumulative allocation statistics + virtual void reset_accumulated_stats() = 0; + + // Resets the peak memory usage metrics + virtual void reset_peak_stats() = 0; +}; + +template +struct CachingHostAllocatorInterface : public HostAllocator { + CachingHostAllocatorInterface() : impl_(std::make_unique()) {} + + at::DataPtr allocate(size_t size) override { + auto ptr_and_ctx = impl_->allocate(size); + return { + ptr_and_ctx.first, + ptr_and_ctx.second, + deleteFunc, // Use the template parameter deleter function + at::DeviceType::CPU}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void free(void* ctx) { impl_->free(ctx); } +<<<<<<< HEAD template bool record_event(void* ptr, void* ctx, S stream) { return impl_->record_event(ptr, ctx, stream); } void empty_cache() { +======= + bool record_event(void* ptr, void* ctx, c10::Stream stream) override { + return impl_->record_event(ptr, ctx, stream); + } + + void empty_cache() override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) impl_->empty_cache(); } @@ -646,6 +729,7 @@ struct CachingHostAllocatorInterface : public at::Allocator { impl_->copy_data(dest, src, count); } +<<<<<<< HEAD HostStats getStats() { return impl_->getStats(); } @@ -655,11 +739,59 @@ struct CachingHostAllocatorInterface : public at::Allocator { } void resetPeakStats() { +======= + HostStats get_stats() override { + return impl_->getStats(); + } + + void reset_accumulated_stats() override { + impl_->resetAccumulatedStats(); + } + + void reset_peak_stats() override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) impl_->resetPeakStats(); } std::unique_ptr impl_; }; +<<<<<<< HEAD +======= +#define DECLARE_HOST_ALLOCATOR(name, impl, deleter, instance) \ + void deleter(void* ptr); \ + struct name final \ + : public at::CachingHostAllocatorInterface {}; \ + static name instance; \ + void deleter(void* ptr) { \ + instance.free(ptr); \ + } + +/** + * Set the host allocator for DeviceType `device_type`. This allocator manages + * pinned memory on the host that can be accessed efficiently by the specified + * device type. Note that this function is not thread-safe. 
+ */ +TORCH_API void setHostAllocator( + at::DeviceType device_type, + at::HostAllocator* allocator, + uint8_t priority = 0); + +TORCH_API at::HostAllocator* getHostAllocator(at::DeviceType device_type); + +template +struct HostAllocatorRegistry { + explicit HostAllocatorRegistry(HostAllocator* allocator) { + at::setHostAllocator(device_type, allocator); + } +}; + +#define REGISTER_HOST_ALLOCATOR(device_type, allocator) \ + namespace { \ + static at::HostAllocatorRegistry \ + g_host_allocator_registry_instance(allocator); \ + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at C10_DIAGNOSTIC_POP() diff --git a/aten/src/ATen/core/Dict.h b/aten/src/ATen/core/Dict.h index d187d7b7c116..f288c8fa5da5 100644 --- a/aten/src/ATen/core/Dict.h +++ b/aten/src/ATen/core/Dict.h @@ -116,10 +116,14 @@ class DictIterator final { DictIterator(const DictIterator& rhs): entryRef_(rhs.entryRef_) {} DictIterator(DictIterator&& rhs) noexcept: entryRef_(std::move(rhs.entryRef_)) {} +<<<<<<< HEAD DictIterator& operator=(const DictIterator& rhs) { entryRef_ = rhs.entryRef_; return *this; } +======= + DictIterator& operator=(const DictIterator& rhs) = default; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DictIterator& operator=(DictIterator&& rhs) noexcept { entryRef_ = std::move(rhs.entryRef_); return *this; diff --git a/aten/src/ATen/core/Dict_inl.h b/aten/src/ATen/core/Dict_inl.h index 6261af5fb66a..3e8e555557eb 100644 --- a/aten/src/ATen/core/Dict_inl.h +++ b/aten/src/ATen/core/Dict_inl.h @@ -53,8 +53,12 @@ inline size_t DictKeyHash::operator()(const IValue& ivalue) const { } else if (ivalue.isDevice()) { return std::hash()(ivalue.toDevice()); } else { +<<<<<<< HEAD throw std::runtime_error( "Can't hash IValues with tag '" + ivalue.tagKind() + "'"); +======= + TORCH_CHECK(false, "Can't hash IValues with tag '", ivalue.tagKind(), "'"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } diff --git a/aten/src/ATen/core/Dimname.cpp b/aten/src/ATen/core/Dimname.cpp index 88e368c605cf..f645224fd8a4 100644 --- a/aten/src/ATen/core/Dimname.cpp +++ b/aten/src/ATen/core/Dimname.cpp @@ -25,9 +25,16 @@ bool Dimname::isValidName(const std::string& name) { } for (auto it = name.begin(); it != name.end(); ++it) { // NOLINTNEXTLINE(bugprone-branch-clone) +<<<<<<< HEAD if (std::isalpha(*it) || *it == '_') { continue; } else if (it != name.begin() && std::isdigit(*it)) { +======= + const unsigned char ch = static_cast(*it); + if (std::isalpha(ch) || ch == '_') { + continue; + } else if (it != name.begin() && std::isdigit(ch)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue; } return false; diff --git a/aten/src/ATen/core/Formatting.cpp b/aten/src/ATen/core/Formatting.cpp index 7762e543234a..9b53525535b1 100644 --- a/aten/src/ATen/core/Formatting.cpp +++ b/aten/src/ATen/core/Formatting.cpp @@ -1,5 +1,6 @@ #include #include +<<<<<<< HEAD #include #include @@ -13,6 +14,25 @@ std::ostream& operator<<(std::ostream & out, Backend b) { } std::ostream& operator<<(std::ostream & out, const Scalar& s) { +======= +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace c10 { +std::ostream& operator<<(std::ostream& out, Backend b) { + return out << 
toString(b); +} + +std::ostream& operator<<(std::ostream& out, const Scalar& s) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (s.isFloatingPoint()) { return out << s.toDouble(); } @@ -35,6 +55,7 @@ std::ostream& operator<<(std::ostream & out, const Scalar& s) { } std::string toString(const Scalar& s) { +<<<<<<< HEAD std::stringstream out; out << s; return std::move(out).str(); @@ -75,17 +96,56 @@ static std::tuple __printFormat(std::ostream& stream, const Tensor& if(size == 0) { return std::make_tuple(1., 0); } +======= + return fmt::format("{}", fmt::streamed(s)); +} +} // namespace c10 + +namespace at { + +std::ostream& operator<<(std::ostream& out, const DeprecatedTypeProperties& t) { + return out << t.toString(); +} + +enum class FormatType { + Default, // 'g' format (defaultfloat equivalent) + Scientific, // 'e' format with precision 4 + Fixed // 'f' format with precision 4 +}; + +struct PrintFormat { + double scale; + int width; + FormatType type; + + PrintFormat(double s, int w, FormatType t = FormatType::Default) + : scale(s), width(w), type(t) {} +}; + +static PrintFormat __printFormat(const Tensor& self) { + auto size = self.numel(); + if (size == 0) { + return PrintFormat(1., 0); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool intMode = true; auto self_p = self.const_data_ptr(); for (const auto i : c10::irange(size)) { auto z = self_p[i]; +<<<<<<< HEAD if(std::isfinite(z)) { if(z != std::ceil(z)) { +======= + if (std::isfinite(z)) { + if (z != std::ceil(z)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) intMode = false; break; } } } +<<<<<<< HEAD int64_t offset = 0; while(!std::isfinite(self_p[offset])) { offset = offset + 1; @@ -110,16 +170,42 @@ static std::tuple __printFormat(std::ostream& stream, const Tensor& } } if(expMin != 0) { +======= + + int64_t offset = 0; + while (offset < size && !std::isfinite(self_p[offset])) { + offset = offset + 1; + } + + double expMin = 1; + double expMax = 1; + if (offset != size) { + expMin = std::fabs(self_p[offset]); + expMax = std::fabs(self_p[offset]); + for (const auto i : c10::irange(offset, size)) { + double z = std::fabs(self_p[i]); + if (std::isfinite(z)) { + expMin = std::min(expMin, z); + expMax = std::max(expMax, z); + } + } + if (expMin != 0) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) expMin = std::floor(std::log10(expMin)) + 1; } else { expMin = 1; } +<<<<<<< HEAD if(expMax != 0) { +======= + if (expMax != 0) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) expMax = std::floor(std::log10(expMax)) + 1; } else { expMax = 1; } } +<<<<<<< HEAD double scale = 1; int sz = 11; if(intMode) { @@ -144,10 +230,39 @@ static std::tuple __printFormat(std::ostream& stream, const Tensor& stream << std::fixed << std::setprecision(4); } else { if(expMax == 0) { +======= + + double scale = 1; + int sz = 11; + + if (intMode) { + if (expMax > 9) { + sz = 11; + return PrintFormat(scale, sz, FormatType::Scientific); + } else { + sz = static_cast(expMax) + 1; + return PrintFormat(scale, sz, FormatType::Default); + } + } else { + if (expMax - expMin > 4) { + sz = 11; + if (std::fabs(expMax) > 99 || std::fabs(expMin) > 99) { + sz = sz + 
1; + } + return PrintFormat(scale, sz, FormatType::Scientific); + } else { + if (expMax > 5 || expMax < 0) { + sz = 7; + scale = std::pow(10, expMax - 1); + return PrintFormat(scale, sz, FormatType::Fixed); + } else { + if (expMax == 0) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sz = 7; } else { sz = static_cast(expMax) + 6; } +<<<<<<< HEAD stream << std::fixed << std::setprecision(4); } } @@ -176,10 +291,54 @@ static void __printMatrix(std::ostream& stream, const Tensor& self, int64_t line int64_t lastColumn = -1; while(firstColumn < self.size(1)) { if(firstColumn + nColumnPerLine <= self.size(1)) { +======= + return PrintFormat(scale, sz, FormatType::Fixed); + } + } + } +} + +// Precompiled format specs +static constexpr auto FMT_G = FMT_COMPILE("{:>{}g}"); +static constexpr auto FMT_E4 = FMT_COMPILE("{:>{}.4e}"); +static constexpr auto FMT_F4 = FMT_COMPILE("{:>{}.4f}"); + +// Print a single value directly into the stream buffer with no temporaries +static void printValue(std::ostream& stream, double v, const PrintFormat& pf) { + auto out_it = std::ostreambuf_iterator(stream); + double val = v / pf.scale; + switch (pf.type) { + case FormatType::Default: + fmt::format_to(out_it, FMT_G, val, pf.width); + break; + case FormatType::Scientific: + fmt::format_to(out_it, FMT_E4, val, pf.width); + break; + case FormatType::Fixed: + fmt::format_to(out_it, FMT_F4, val, pf.width); + break; + } +} + +static void __printMatrix( + std::ostream& stream, + const Tensor& self, + int64_t linesize, + int64_t indent) { + auto printFmt = __printFormat(self); + + int64_t nColumnPerLine = (linesize - indent) / (printFmt.width + 1); + int64_t firstColumn = 0; + int64_t lastColumn = -1; + + while (firstColumn < self.size(1)) { + if (firstColumn + nColumnPerLine <= self.size(1)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) lastColumn = firstColumn + nColumnPerLine - 1; } else { lastColumn = self.size(1) - 1; } +<<<<<<< HEAD if(nColumnPerLine < self.size(1)) { if(firstColumn != 0) { stream << '\n'; @@ -208,6 +367,45 @@ static void __printMatrix(std::ostream& stream, const Tensor& self, int64_t line } } else { stream << " "; +======= + + if (nColumnPerLine < self.size(1)) { + if (firstColumn != 0) { + stream.put('\n'); + } + fmt::print( + stream, + "Columns {} to {}{:>{}s}", + firstColumn + 1, + lastColumn + 1, + "", // empty string to pad + indent // width to pad to + ); + } + + if (printFmt.scale != 1) { + fmt::print(stream, "{} *\n{:>{}s}", printFmt.scale, "", indent); + } + + for (const auto l : c10::irange(self.size(0))) { + Tensor row = self.select(0, l); + const double* row_ptr = row.const_data_ptr(); + + for (const auto c : c10::irange(firstColumn, lastColumn + 1)) { + printValue(stream, row_ptr[c], printFmt); + + if (c == lastColumn) { + stream.put('\n'); + if (l != self.size(0) - 1) { + if (printFmt.scale != 1) { + fmt::print(stream, "{:>{}s} ", "", indent); + } else { + fmt::print(stream, "{:>{}s}", "", indent); + } + } + } else { + stream.put(' '); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } } @@ -215,6 +413,7 @@ static void __printMatrix(std::ostream& stream, const Tensor& self, int64_t line } } +<<<<<<< HEAD static void __printTensor(std::ostream& stream, Tensor& self, int64_t linesize) { std::vector counter(self.ndimension()-2); @@ -229,6 +428,23 
@@ static void __printTensor(std::ostream& stream, Tensor& self, int64_t linesize) counter[i] = counter[i] + 1; if(counter[i] >= self.size(i)) { if(i == self.ndimension()-3) { +======= +static void __printTensor( + std::ostream& stream, + Tensor& self, + int64_t linesize) { + std::vector counter(self.ndimension() - 2, 0); + counter[0] = -1; + + bool start = true; + bool finished = false; + + while (true) { + for (int64_t i = 0; self.ndimension() - 2; i++) { + counter[i] = counter[i] + 1; + if (counter[i] >= self.size(i)) { + if (i == self.ndimension() - 3) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) finished = true; break; } @@ -237,6 +453,7 @@ static void __printTensor(std::ostream& stream, Tensor& self, int64_t linesize) break; } } +<<<<<<< HEAD if(finished) { break; } @@ -252,10 +469,29 @@ static void __printTensor(std::ostream& stream, Tensor& self, int64_t linesize) stream << counter[i]+1 << ","; } stream << ".,.) = " << '\n'; +======= + if (finished) { + break; + } + if (start) { + start = false; + } else { + stream.put('\n'); + } + + stream.put('('); + Tensor tensor = self; + for (const auto i : c10::irange(self.ndimension() - 2)) { + tensor = tensor.select(0, counter[i]); + fmt::print(stream, "{},", counter[i] + 1); + } + fmt::print(stream, ".,.) = \n"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __printMatrix(stream, tensor, linesize, 1); } } +<<<<<<< HEAD void print(const Tensor & t, int64_t linesize) { print(std::cout,t,linesize); } @@ -342,3 +578,115 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesi } } +======= +void print(const Tensor& t, int64_t linesize) { + print(std::cout, t, linesize); +} + +std::ostream& print( + std::ostream& stream, + const Tensor& tensor_, + int64_t linesize) { + if (!tensor_.defined()) { + fmt::print(stream, "[ Tensor (undefined) ]"); + return stream; + } + + if (tensor_.is_sparse()) { + fmt::print(stream, "[ {}{{}}\nindices:\n", tensor_.toString()); + print(stream, tensor_._indices(), linesize); + fmt::print(stream, "\nvalues:\n"); + print(stream, tensor_._values(), linesize); + fmt::print(stream, "\nsize:\n{}\n]", fmt::streamed(tensor_.sizes())); + return stream; + } + + Tensor tensor; + + if (tensor_.is_quantized()) { + tensor = tensor_.dequantize().to(kCPU, kDouble).contiguous(); + } else if (tensor_.is_mkldnn()) { + fmt::print(stream, "MKLDNN Tensor: "); + tensor = tensor_.to_dense().to(kCPU, kDouble).contiguous(); + } else if (tensor_.is_mps()) { + // MPS does not support double tensors, so first copy then convert + tensor = tensor_.to(kCPU).to(kDouble).contiguous(); + } else { + tensor = tensor_.to(kCPU, kDouble).contiguous(); + } + + if (tensor.ndimension() == 0) { + fmt::print( + stream, + "{}\n[ {}{{}}", + tensor.const_data_ptr()[0], + tensor_.toString()); + } else if (tensor.ndimension() == 1) { + if (tensor.numel() > 0) { + auto printFmt = __printFormat(tensor); + if (printFmt.scale != 1) { + fmt::print(stream, "{} *\n", printFmt.scale); + } + const double* tensor_p = tensor.const_data_ptr(); + for (const auto i : c10::irange(tensor.size(0))) { + printValue(stream, tensor_p[i], printFmt); + stream.put('\n'); + } + } + fmt::print(stream, "[ {}{{{}}}", tensor_.toString(), tensor.size(0)); + } else if (tensor.ndimension() == 2) { + if (tensor.numel() > 0) { + __printMatrix(stream, tensor, linesize, 0); + } + fmt::print( + stream, + "[ 
{}{{{},{}}}", + tensor_.toString(), + tensor.size(0), + tensor.size(1)); + } else { + if (tensor.numel() > 0) { + __printTensor(stream, tensor, linesize); + } + fmt::print(stream, "[ {}{{{}", tensor_.toString(), tensor.size(0)); + for (const auto i : c10::irange(1, tensor.ndimension())) { + fmt::print(stream, ",{}", tensor.size(i)); + } + fmt::print(stream, "}}"); + } + + // Add quantization info + if (tensor_.is_quantized()) { + fmt::print(stream, ", qscheme: {}", toString(tensor_.qscheme())); + if (tensor_.qscheme() == c10::kPerTensorAffine) { + fmt::print( + stream, + ", scale: {}, zero_point: {}", + tensor_.q_scale(), + tensor_.q_zero_point()); + } else if ( + tensor_.qscheme() == c10::kPerChannelAffine || + tensor_.qscheme() == c10::kPerChannelAffineFloatQParams) { + fmt::print(stream, ", scales: "); + print(stream, tensor_.q_per_channel_scales(), linesize); + fmt::print(stream, ", zero_points: "); + print(stream, tensor_.q_per_channel_zero_points(), linesize); + fmt::print(stream, ", axis: {}", tensor_.q_per_channel_axis()); + } + } + + // Proxy check for if autograd was built + if (tensor.getIntrusivePtr()->autograd_meta()) { + auto& fw_grad = tensor._fw_grad(/* level */ 0); + if (fw_grad.defined()) { + fmt::print(stream, ", tangent:\n"); + print(stream, fw_grad, linesize); + } + } + + fmt::print(stream, " ]"); + return stream; +} + +} // namespace at +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/core/IListRef_inl.h b/aten/src/ATen/core/IListRef_inl.h index a21bd22cf16c..01220d3c6b2b 100644 --- a/aten/src/ATen/core/IListRef_inl.h +++ b/aten/src/ATen/core/IListRef_inl.h @@ -168,7 +168,13 @@ class IListRefTagImpl */ static IListRefConstRef iterator_get( const typename list_type::const_iterator& it) { +<<<<<<< HEAD const auto& ivalue = (*it).get(); +======= + C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wdangling-reference") + const auto& ivalue = (*it).get(); + C10_DIAGNOSTIC_POP() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (!ivalue.isNone()) { const auto& tensor = ivalue.toTensor(); return (tensor.defined()) ? 
tensor : at::OptionalTensorRef{}; diff --git a/aten/src/ATen/core/IListRef_test.cpp b/aten/src/ATen/core/IListRef_test.cpp index 505a80216d67..502c669ee59e 100644 --- a/aten/src/ATen/core/IListRef_test.cpp +++ b/aten/src/ATen/core/IListRef_test.cpp @@ -42,7 +42,11 @@ static std::vector get_unboxed_opt_tensor_vector() { } template +<<<<<<< HEAD void check_elements_same(at::ITensorListRef list, const T& thing, int use_count) { +======= +static void check_elements_same(at::ITensorListRef list, const T& thing, int use_count) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) EXPECT_EQ(thing.size(), list.size()); size_t i = 0; for (const auto& t : list) { diff --git a/aten/src/ATen/core/List_test.cpp b/aten/src/ATen/core/List_test.cpp index 71029598aab2..41bf5d4d46be 100644 --- a/aten/src/ATen/core/List_test.cpp +++ b/aten/src/ATen/core/List_test.cpp @@ -2,6 +2,10 @@ #include using namespace c10; +<<<<<<< HEAD +======= +using std::string; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NOLINTBEGIN(performance-move-const-arg, bugprone-use-after-move, *analyzer*Move) TEST(ListTestIValueBasedList, givenEmptyList_whenCallingEmpty_thenReturnsTrue) { diff --git a/aten/src/ATen/core/NamedTensor.cpp b/aten/src/ATen/core/NamedTensor.cpp index b1126e212265..547d951adff3 100644 --- a/aten/src/ATen/core/NamedTensor.cpp +++ b/aten/src/ATen/core/NamedTensor.cpp @@ -5,7 +5,11 @@ namespace at { +<<<<<<< HEAD thread_local bool NamesMode_enabled = true; +======= +thread_local static bool NamesMode_enabled = true; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool NamesMode::is_enabled() { return NamesMode_enabled; diff --git a/aten/src/ATen/core/Tensor.cpp b/aten/src/ATen/core/Tensor.cpp index 43474515db0f..bd23edf7cbc2 100644 --- a/aten/src/ATen/core/Tensor.cpp +++ b/aten/src/ATen/core/Tensor.cpp @@ -51,9 +51,14 @@ TensorBase TensorBase::to( } void TensorBase::enforce_invariants() { +<<<<<<< HEAD if (impl_.get() == nullptr) { throw std::runtime_error("TensorImpl with nullptr is not supported"); } +======= + TORCH_CHECK( + impl_.get() != nullptr, "TensorImpl with nullptr is not supported"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Following line throws if the method is not a POD data type or is not // supported by ATen scalar_type(); diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h index 8d300debebe3..9722cbb06526 100644 --- a/aten/src/ATen/core/TensorBase.h +++ b/aten/src/ATen/core/TensorBase.h @@ -57,16 +57,27 @@ inline bool variable_excluded_from_dispatch() { // NOTE: [Tensor vs. TensorBase] // // Tensor, being the central data structure in PyTorch, gets used and +<<<<<<< HEAD // it's header included almost everywhere. Unfortunately this means // every time an operator signature is updated or changed in // native_functions.yaml, you (and every other PyTorch developer) need // to recompile all of ATen and it's dependencies. +======= +// its header included almost everywhere. Unfortunately this means +// every time an operator signature is updated or changed in +// native_functions.yaml, you (and every other PyTorch developer) need +// to recompile all of ATen and its dependencies. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // // TensorBase aims to break up these header dependencies, and improve // incremental build times for all PyTorch developers. TensorBase // represents a reference counted handle to TensorImpl, exactly the // same as Tensor. However, TensorBase doesn't have code generated +<<<<<<< HEAD // methods in it's API and thus no dependence on native_functions.yaml. +======= +// methods in its API and thus no dependence on native_functions.yaml. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // // Usage tips // ---------- @@ -75,9 +86,15 @@ inline bool variable_excluded_from_dispatch() { // native_functions.yaml (direct or indirect). // - Tensor inherits from TensorBase, so functions taking // `const TensorBase &` are callable with Tensor as well. +<<<<<<< HEAD // - TensorBase can be converted to tensor with `Tensor(tensor_base)`, // but this requires a reference-count bump. OptionalTensorRef on // the other hand can materialize a `const Tensor &` without +======= +// - TensorBase can be converted to Tensor with `Tensor(tensor_base)`, +// but this requires a reference-count bump. OptionalTensorRef, on +// the other hand, can materialize a `const Tensor &` without +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // touching the reference-count. class TORCH_API TensorBase { public: diff --git a/aten/src/ATen/core/VariableFallbackKernel.cpp b/aten/src/ATen/core/VariableFallbackKernel.cpp index 390d9189190e..a8fd023320e5 100644 --- a/aten/src/ATen/core/VariableFallbackKernel.cpp +++ b/aten/src/ATen/core/VariableFallbackKernel.cpp @@ -80,6 +80,13 @@ TORCH_LIBRARY_IMPL(_, AutogradMTIA, m) { m.fallback(AUTOGRAD_FALLBACK); } +<<<<<<< HEAD +======= +TORCH_LIBRARY_IMPL(_, AutogradMAIA, m) { + m.fallback(AUTOGRAD_FALLBACK); +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_LIBRARY_IMPL(_, AutogradXLA, m) { m.fallback(AUTOGRAD_FALLBACK); } diff --git a/aten/src/ATen/core/Vitals.cpp b/aten/src/ATen/core/Vitals.cpp index 13b8eda63859..01e3720b2e44 100644 --- a/aten/src/ATen/core/Vitals.cpp +++ b/aten/src/ATen/core/Vitals.cpp @@ -1,4 +1,8 @@ #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -41,9 +45,15 @@ bool torchVitalEnabled() { // If this is a performance hit, make `enabled` variable static // and return `const bool&` instead bool enabled = []() { +<<<<<<< HEAD auto e = getenv("TORCH_VITAL"); if (e != nullptr) { return e[0] != '\0'; +======= + auto const e = c10::utils::get_env("TORCH_VITAL"); + if (e.has_value()) { + return !e.value().empty(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } return false; }(); diff --git a/aten/src/ATen/core/adaption.cpp b/aten/src/ATen/core/adaption.cpp index ef06b9606ba7..a16f2099ad66 100644 --- a/aten/src/ATen/core/adaption.cpp +++ b/aten/src/ATen/core/adaption.cpp @@ -5,9 +5,14 @@ namespace c10::impl { void common_device_check_failure(Device common_device, const at::Tensor& tensor, at::CheckedFrom methodName, at::CheckedFrom argName) { TORCH_CHECK(false, +<<<<<<< HEAD "Expected all 
tensors to be on the same device, but " "found at least two devices, ", common_device, " and ", tensor.device(), "! " "(when checking argument for argument ", argName, " in method ", methodName, ")"); +======= + "Expected all tensors to be on the same device, but got ", argName, " is on ", tensor.device(), + ", different from other tensors on ", common_device, " (when checking argument in method ", methodName, ")"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } // namespace c10::impl diff --git a/aten/src/ATen/core/alias_info.h b/aten/src/ATen/core/alias_info.h index a8a55bb782c4..6b029cee77d0 100644 --- a/aten/src/ATen/core/alias_info.h +++ b/aten/src/ATen/core/alias_info.h @@ -1,4 +1,9 @@ #pragma once +<<<<<<< HEAD +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -18,6 +23,18 @@ namespace c10 { */ class AliasInfo { public: +<<<<<<< HEAD +======= + AliasInfo() = default; + AliasInfo(bool is_write, const std::set& before_qual_strings, const std::set& after_qual_strings) : isWrite_(is_write) { + for (const auto& s: before_qual_strings) { + beforeSets_.insert(Symbol::fromQualString(s)); + } + for (const auto& s : after_qual_strings) { + afterSets_.insert(Symbol::fromQualString(s)); + } + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Symbol for the set that can alias anything static Symbol wildcardSet() { static const Symbol wc = Symbol::fromQualString("alias::*"); diff --git a/aten/src/ATen/core/boxing/KernelFunction.cpp b/aten/src/ATen/core/boxing/KernelFunction.cpp index b13f827b8f17..8ec24e1c473e 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.cpp +++ b/aten/src/ATen/core/boxing/KernelFunction.cpp @@ -28,7 +28,11 @@ void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle& op, D "Autograd dispatch key for the backend.\n", "If you only want to run inference instead of training, in C++, add `c10::InferenceMode mode;` " "before model.forward(); in Python, use `torch.inference_mode()` as a context manager (see " +<<<<<<< HEAD "https://pytorch.org/docs/stable/generated/torch.inference_mode.html).", +======= + "https://pytorch.org/docs/stable/generated/torch.autograd.grad_mode.inference_mode.html).", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "\nCanonical state\n~~~~~~~~~~~\n", op.dumpState(), "\n\n"); } diff --git a/aten/src/ATen/core/boxing/impl/kernel_function_legacy_test.cpp b/aten/src/ATen/core/boxing/impl/kernel_function_legacy_test.cpp index b25c4e543a40..f804aac737a7 100644 --- a/aten/src/ATen/core/boxing/impl/kernel_function_legacy_test.cpp +++ b/aten/src/ATen/core/boxing/impl/kernel_function_legacy_test.cpp @@ -519,7 +519,11 @@ TEST(OperatorRegistrationTestLegacyFunctionBasedKernel, givenKernelWithDictInput EXPECT_EQ(2, captured_dict_size); } +<<<<<<< HEAD string kernelWithDictInputWithOutput(Dict input1) { +======= +std::string kernelWithDictInputWithOutput(Dict input1) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return input1.at("key2"); } @@ -581,7 +585,11 @@ TEST(OperatorRegistrationTestLegacyFunctionBasedKernel, givenKernelWithUnordered EXPECT_EQ(2, captured_dict_size); } +<<<<<<< HEAD 
string kernelWithUnorderedMapInputWithOutput(std::unordered_map input1) { +======= +std::string kernelWithUnorderedMapInputWithOutput(std::unordered_map input1) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return input1.at("key2"); } diff --git a/aten/src/ATen/core/boxing/impl/kernel_function_test.cpp b/aten/src/ATen/core/boxing/impl/kernel_function_test.cpp index 4ebb18d82f96..a1ecea263ab1 100644 --- a/aten/src/ATen/core/boxing/impl/kernel_function_test.cpp +++ b/aten/src/ATen/core/boxing/impl/kernel_function_test.cpp @@ -468,7 +468,11 @@ TEST(OperatorRegistrationTestFunctionBasedKernel, givenKernelWithDictInput_witho EXPECT_EQ(2, captured_dict_size); } +<<<<<<< HEAD string kernelWithDictInputWithOutput(Dict input1) { +======= +std::string kernelWithDictInputWithOutput(Dict input1) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return input1.at("key2"); } diff --git a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor_test.cpp b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor_test.cpp index 42c06c38ac34..866bbda2c92b 100644 --- a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor_test.cpp +++ b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor_test.cpp @@ -463,7 +463,11 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenKernelWithDictInput_withou } struct KernelWithDictInputWithOutput final : OperatorKernel { +<<<<<<< HEAD string operator()(Dict input1) { +======= +std::string operator()(Dict input1) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return input1.at("key2"); } }; @@ -475,7 +479,11 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenKernelWithDictInput_withOu auto op = c10::Dispatcher::singleton().findSchema({"_test::dict_input", ""}); ASSERT_TRUE(op.has_value()); +<<<<<<< HEAD Dict dict; +======= + Dict dict; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) dict.insert("key1", "value1"); dict.insert("key2", "value2"); auto outputs = callOp(*op, dict); @@ -484,7 +492,11 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenKernelWithDictInput_withOu } struct KernelWithDictOutput final : OperatorKernel { +<<<<<<< HEAD Dict operator()(Dict input) { +======= + Dict operator()(Dict input) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return input; } }; @@ -496,12 +508,20 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenKernelWithDictOutput_whenR auto op = c10::Dispatcher::singleton().findSchema({"_test::dict_output", ""}); ASSERT_TRUE(op.has_value()); +<<<<<<< HEAD Dict dict; +======= + Dict dict; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) dict.insert("key1", "value1"); dict.insert("key2", "value2"); auto outputs = callOp(*op, dict); EXPECT_EQ(1, outputs.size()); +<<<<<<< HEAD auto output = c10::impl::toTypedDict(outputs[0].toGenericDict()); +======= + auto output = c10::impl::toTypedDict(outputs[0].toGenericDict()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) EXPECT_EQ(2, output.size()); EXPECT_EQ("value1", output.at("key1")); @@ -520,7 
+540,11 @@ class KernelWithCache final : public OperatorKernel { }; struct KernelWithTupleInput final : OperatorKernel { +<<<<<<< HEAD string operator()(std::tuple input1) { +======= + std::string operator()(std::tuple input1) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return std::get<0>(input1); } }; diff --git a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h index 27438b926db5..5ae5d88e0eed 100644 --- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h +++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h @@ -152,8 +152,16 @@ struct TORCH_API DispatchKeyExtractor final { // no safe toTensorRef method, alas) ks = ks | ivalue.unsafeToTensorImpl()->key_set(); } else if (C10_UNLIKELY(ivalue.isTensorList())) { +<<<<<<< HEAD for (const at::Tensor& tensor : ivalue.toTensorList()) { ks = ks | tensor.key_set(); +======= + // NB: use toListRef as it doesn't induce refcount bumps + // (toTensorListRef is not a thing) + for (const auto& nv : ivalue.toListRef()) { + auto* tensor = nv.unsafeToTensorImpl(); + ks = ks | tensor->key_set(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } // Tensor?[] translates to a c10::List so we need to peek inside @@ -200,6 +208,34 @@ struct TORCH_API DispatchKeyExtractor final { void checkInvariants(const FunctionSchema& schema) const; private: +<<<<<<< HEAD +======= + static bool isDispatchType(const Type& type) { + // Checking isSubtypeOf on a DynamicType heap-allocates a + // DynamicType version of the argument if it's not a DynamicType + // already, and this has measurable overhead during startup. 
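  // Illustrative note (a hedged sketch, not code from this patch): without
  // the cache below, each mobile call would effectively re-do
  //   type.isSubtypeOf(*DynamicType::create(*ListType::ofTensors())) || ...
  // i.e. rebuild DynamicType versions of the list/optional argument types on
  // every invocation, whereas the function-local `static const CachedTypes ct`
  // below constructs them exactly once (function-local static initialization
  // is thread-safe since C++11).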
+#ifdef C10_MOBILE + struct CachedTypes { + DynamicTypePtr listOfTensors; + DynamicTypePtr listOfOptionalTensors; + DynamicTypePtr optionalOfTensor; + }; + static const CachedTypes ct = { + DynamicType::create(*ListType::ofTensors()), + DynamicType::create(*ListType::ofOptionalTensors()), + DynamicType::create(*OptionalType::ofTensor())}; + return type.isSubtypeOf(c10::TypeFactory::get()) || + type.isSubtypeOf(ct.listOfTensors) || + type.isSubtypeOf(ct.listOfOptionalTensors) || + type.isSubtypeOf(ct.optionalOfTensor); +#else // C10_MOBILE + return type.isSubtypeOf(*TensorType::get()) || + type.isSubtypeOf(*ListType::ofTensors()) || + type.isSubtypeOf(*ListType::ofOptionalTensors()) || + type.isSubtypeOf(*OptionalType::ofTensor()); +#endif // C10_MOBILE + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static c10::utils::bitset makeBitsetForDispatchArgs( const FunctionSchema& schema) { TORCH_CHECK( @@ -210,6 +246,7 @@ struct TORCH_API DispatchKeyExtractor final { c10::utils::bitset::NUM_BITS()); c10::utils::bitset dispatch_arg_indices_reverse; for (const auto index : c10::irange(schema.arguments().size())) { +<<<<<<< HEAD if (schema.arguments()[index].type()->isSubtypeOf(*TensorType::get()) || schema.arguments()[index].type()->isSubtypeOf( *ListType::ofTensors()) || @@ -217,6 +254,9 @@ struct TORCH_API DispatchKeyExtractor final { *ListType::ofOptionalTensors()) || schema.arguments()[index].type()->isSubtypeOf( *OptionalType::ofTensor())) { +======= + if (isDispatchType(*schema.arguments()[index].type())) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) dispatch_arg_indices_reverse.set(schema.arguments().size() - 1 - index); } } @@ -225,8 +265,12 @@ struct TORCH_API DispatchKeyExtractor final { explicit DispatchKeyExtractor(c10::utils::bitset dispatch_arg_indices_reverse) : dispatch_arg_indices_reverse_(dispatch_arg_indices_reverse), +<<<<<<< HEAD nonFallthroughKeys_(DispatchKeySet::FULL), requiresBitsetPerBackend_(false) { +======= + nonFallthroughKeys_(DispatchKeySet::FULL) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size())) { nonFallthroughKeysPerBackend_[i] = DispatchKeySet::FULL; } @@ -252,7 +296,11 @@ struct TORCH_API DispatchKeyExtractor final { // Flag to tell us if we can use the single set of nonFallthroughKeys_ (fast // path), or if we need to fall back to the slower path and check // nonFallthroughKeysPerBackend_ +<<<<<<< HEAD bool requiresBitsetPerBackend_; +======= + bool requiresBitsetPerBackend_{false}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; } // namespace c10 diff --git a/aten/src/ATen/core/dispatch/Dispatcher.cpp b/aten/src/ATen/core/dispatch/Dispatcher.cpp index 7ff4901a16b0..e9d0cb88e95e 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.cpp +++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp @@ -1,9 +1,15 @@ #include #include +<<<<<<< HEAD #include #include #include #include +======= +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifdef FBCODE_CAFFE2 #include @@ -17,6 +23,7 @@ TORCH_SDT_DEFINE_SEMAPHORE(operator_end) #endif bool show_dispatch_trace() { +<<<<<<< HEAD 
static auto envar = std::getenv("TORCH_SHOW_DISPATCH_TRACE"); if (envar) { @@ -24,6 +31,15 @@ bool show_dispatch_trace() { return false; } if (strcmp(envar, "1") == 0) { +======= + static auto envar = c10::utils::get_env("TORCH_SHOW_DISPATCH_TRACE"); + + if (envar.has_value()) { + if (envar == "0") { + return false; + } + if (envar == "1") { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return true; } TORCH_WARN( @@ -180,6 +196,21 @@ const std::vector Dispatcher::getAllOpNames() { }); } +<<<<<<< HEAD +======= +const std::vector Dispatcher::getAllOpNamesForDispatchKey(DispatchKey k) { + return operatorLookupTable_.read([&] (const ska::flat_hash_map& operatorLookupTable) -> std::vector { + std::vector allOpNames; + for (const auto& op : operatorLookupTable) { + if (op.second.hasKernelForDispatchKey(k)) { + allOpNames.push_back(op.first); + } + } + return allOpNames; + }); +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Postcondition: caller is responsible for disposing of registration when they // are done OperatorHandle Dispatcher::findOrRegisterName_(const OperatorName& op_name) { diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index dbc501afe7ce..9407250dffa4 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -165,6 +165,13 @@ class TORCH_API Dispatcher final { // Returns a list of all operator names present in the operatorLookupTable_ const std::vector getAllOpNames(); +<<<<<<< HEAD +======= + // Returns a list of all operator names present in the operatorLookupTable_ + // for a given dispatch key + const std::vector getAllOpNamesForDispatchKey(DispatchKey k); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // ------------------------------------------------------------------------ // // Invoking operators diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index 751577df6f2d..b197a3da0ef1 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -2,6 +2,14 @@ #include #include #include +<<<<<<< HEAD +======= +#include + +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace c10::impl { @@ -17,6 +25,48 @@ namespace { #endif } +<<<<<<< HEAD +======= +static const std::vector& allDispatchKeysInFullSet() { + static const auto result = []() { + std::vector vec; + for (const auto dispatch_key: DispatchKeySet(DispatchKeySet::FULL)) { + vec.push_back(dispatch_key); + } + return vec; + }(); + return result; +} + +// Returns an array of the same size as the dispatch table, where each +// entry is the DispatchKey that the corresponding index in the +// dispatch table represents. +static const auto& getDispatchTableIndexToKey() { + static const auto result = []() { + using result_type = std::array; + result_type arr; + arr.fill(DispatchKey::Undefined); + for (const auto dispatch_key: allDispatchKeysInFullSet()) { + const auto index = getDispatchTableIndexForDispatchKey(dispatch_key); + TORCH_INTERNAL_ASSERT(arr.at(index) == DispatchKey::Undefined); + arr.at(index) = dispatch_key; + } + // Self-test. 
Should be plenty cheap enough to just run in prod + // builds. We just need to make sure that we have the dispatch key + // for every entry in the table, and we assert in + // update_array_entry above that we also don't have any conflicts + // during computation. + TORCH_INTERNAL_ASSERT(getDispatchTableIndexForDispatchKey(DispatchKey::Undefined) == 0); + TORCH_INTERNAL_ASSERT(arr[0] == DispatchKey::Undefined); + for (const auto index : c10::irange(1, arr.size())) { + TORCH_INTERNAL_ASSERT(arr[index] != DispatchKey::Undefined, "missing dispatch key at index ", index); + } + return arr; + }(); + return result; +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) OperatorEntry::OperatorEntry(OperatorName&& operator_name) : name_(std::move(operator_name)) , schema_() @@ -31,8 +81,32 @@ OperatorEntry::OperatorEntry(OperatorName&& operator_name) , is_observed_(ObservedOperators::isObserved(name_)) { // Pick up any backend fallbacks that were registered prior to this +<<<<<<< HEAD // OperatorEntry being created updateDispatchTableFull_(c10::Dispatcher::singleton()); +======= + // OperatorEntry being created. + + // We are essentially directly implementing + // updateDispatchTableFull_, taking into account that we know + // kernels_ is empty() and therefore + // computeDispatchTableEntryWithDebug cases 1 and 2.1 through 2.5 + // won't do anything. + const auto& dispatcher = c10::Dispatcher::singleton(); + const auto& dispatch_table_index_to_key = getDispatchTableIndexToKey(); + for (const auto dispatch_ix: c10::irange(dispatcher.backendFallbackKernels_.size())) { + const auto& bfk = dispatcher.backendFallbackKernels_[dispatch_ix]; + if (bfk.kernel.isValid()) { + dispatchTable_[dispatch_ix] = bfk.kernel; + if (bfk.kernel.isFallthrough()) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dispatch_ix < dispatch_table_index_to_key.size()); + dispatchKeyExtractor_.setOperatorHasFallthroughForKey(dispatch_table_index_to_key[dispatch_ix], true); + } + } else { + dispatchTable_[dispatch_ix] = missingKernel().kernel; + } + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } namespace { @@ -150,7 +224,12 @@ OperatorEntry::AnnotatedKernelContainerIterator OperatorEntry::registerKernel( #endif // Suppress the warning for Meta key as we are overriding C++ meta functions with python meta functions // for some ops +<<<<<<< HEAD if (dispatch_key != DispatchKey::Meta) { +======= + // Also suppress the warning for MTIA, as MTIA achieves CPU fallback by overriding registration. + if (dispatch_key != DispatchKey::Meta && dispatch_key != DispatchKey::MTIA) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN_ONCE("Warning only once for all operators, other operators may also be overridden.\n", " Overriding a previously registered kernel for the same operator and the same dispatch key\n", " operator: ", (schema_.has_value() ? 
toString(schema_->schema) : toString(name_)), "\n", @@ -290,7 +369,11 @@ std::pair OperatorEntry::computeDispatchTab // CompositExplicitAutogradNonFunctional > CompositeExplicitAutograd > CompositeImplicitAutograd > Autograd // Note [CompositeExplicitAutograd and CompositeImplicitAutograd] // When there're registrations to both CompositeExplicitAutograd & CompositeImplicitAutograd & Autograd, from (2.2) we know CompositeExplicitAutograd +<<<<<<< HEAD // and Autograd kernels will be picked up and CompositeImplicitAutograd is overriden. +======= + // and Autograd kernels will be picked up and CompositeImplicitAutograd is overridden. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // This is fine and in practice CompositeExplicitAutograd and CompositeImplicitAutograd shouldn't co-exist for an op. // TODO: Update alias key precedence after we add new alias keys AutogradDispatchCPUOrCUDA . @@ -452,7 +535,11 @@ void OperatorEntry::updateDispatchTableFull_(const c10::Dispatcher& dispatcher) // or CompositeImplicitAutograd alias key so that we don't break the support. Ideally isIncludedInAlias(Undefined, CompositeImplicitAutograd) // should return true, it returns false because Undefined cannot be represented in a DispatchKeySet. updateDispatchTable_(dispatcher, DispatchKey::Undefined); +<<<<<<< HEAD for (auto k : DispatchKeySet(DispatchKeySet::FULL)) { +======= + for (auto k : allDispatchKeysInFullSet()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) updateDispatchTable_(dispatcher, k); } } @@ -466,7 +553,11 @@ void OperatorEntry::checkInvariants() const { for (const auto& kv : kernels_) { TORCH_INTERNAL_ASSERT(!kv.second.empty(), dumpState()); } +<<<<<<< HEAD for (auto k : DispatchKeySet(DispatchKeySet::FULL)) { +======= + for (auto k : allDispatchKeysInFullSet()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto expected_k = computeDispatchTableEntry(c10::Dispatcher::singleton(), k); auto idx = getDispatchTableIndexForDispatchKey(k); if (C10_UNLIKELY(idx == -1)) { @@ -483,7 +574,11 @@ std::string OperatorEntry::listAllDispatchKeys() const { str << "["; bool has_kernels = false; +<<<<<<< HEAD for (auto k : DispatchKeySet(DispatchKeySet::FULL)) { +======= + for (auto k : allDispatchKeysInFullSet()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto iter = getDispatchTableIndexForDispatchKey(k); if (iter == -1 || !dispatchTable_[iter].isValid()) { continue; @@ -569,7 +664,11 @@ std::string OperatorEntry::dumpComputedTable() const { // Need to handle Undefined separately, because its a runtime key that can't be represented // in a DispatchKeySet. 
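  // (Undefined still owns a real slot, though: the self-test in
  // getDispatchTableIndexToKey() above pins it to dispatch table index 0; it
  // simply never comes out of the allDispatchKeysInFullSet() iteration, so it
  // is listed explicitly before the other runtime keys.)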
std::vector runtime_keys = {DispatchKey::Undefined}; +<<<<<<< HEAD for (auto k : DispatchKeySet(DispatchKeySet::FULL)) runtime_keys.push_back(k); +======= + for (auto k : allDispatchKeysInFullSet()) runtime_keys.push_back(k); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (auto k : runtime_keys) { auto kernel_prov = computeDispatchTableEntryWithDebug(c10::Dispatcher::singleton(), k); diff --git a/aten/src/ATen/core/dynamic_type.cpp b/aten/src/ATen/core/dynamic_type.cpp index 543c6f830f40..fdbf394e5372 100644 --- a/aten/src/ATen/core/dynamic_type.cpp +++ b/aten/src/ATen/core/dynamic_type.cpp @@ -78,15 +78,23 @@ DynamicType::~DynamicType() { arguments_.~Arguments(); } +<<<<<<< HEAD std::shared_ptr DynamicType::create(const Type& other) { +======= +SingletonOrSharedTypePtr DynamicType::create(const Type& other) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (auto dynRaw = other.castRaw()) { TORCH_INTERNAL_ASSERT( !dynRaw->weak_from_this().expired(), "Error creating dynamic type instance not managed by shared_ptr: ", other.str()); +<<<<<<< HEAD } if (auto dyn = other.cast()) { return dyn; +======= + return SingletonTypePtr(dynRaw); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } return std::shared_ptr(new DynamicType{other}); } diff --git a/aten/src/ATen/core/dynamic_type.h b/aten/src/ATen/core/dynamic_type.h index 2e7b7cbc5d31..2d095dc1a81d 100644 --- a/aten/src/ATen/core/dynamic_type.h +++ b/aten/src/ATen/core/dynamic_type.h @@ -187,7 +187,13 @@ class DynamicType : public SharedType { return false; } friend struct Type; +<<<<<<< HEAD static std::shared_ptr create(const Type& ty); +======= + // NOTE: Here we are using SingletonOrSharedTypePtr to mean + // "original-type-because-it-was-actually-a-DynamicType or shared". 
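  // (Mirroring the dynamic_type.cpp change above: create() hands back a
  // non-owning SingletonTypePtr when the argument already is a
  // shared_ptr-managed DynamicType, and only allocates a fresh shared
  // DynamicType for any other Type; callers can treat the result as an
  // ordinary smart pointer in either case.)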
+ static SingletonOrSharedTypePtr create(const Type& ty); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DynamicType(const Type& other); bool equals(const DynamicType& other) const; diff --git a/aten/src/ATen/core/function_schema.cpp b/aten/src/ATen/core/function_schema.cpp index 9478b11ee108..def0d11a2d89 100644 --- a/aten/src/ATen/core/function_schema.cpp +++ b/aten/src/ATen/core/function_schema.cpp @@ -41,9 +41,21 @@ FunctionSchema FunctionSchema::cloneWithRealTypes(bool with_symint) const { } }; std::vector new_arguments, new_returns; +<<<<<<< HEAD std::transform(arguments().begin(), arguments().end(), std::back_inserter(new_arguments), cloneWithRealTypes); // NB: SymInt returns are always SymInt std::transform(returns().begin(), returns().end(), std::back_inserter(new_returns), alwaysCloneWithRealTypes); +======= + new_arguments.reserve(arguments().size()); + for (const auto& arg: arguments()) { + new_arguments.push_back(cloneWithRealTypes(arg)); + } + // NB: SymInt returns are always SymInt + new_returns.reserve(returns().size()); + for (const auto& ret: returns()) { + new_returns.push_back(alwaysCloneWithRealTypes(ret)); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return FunctionSchema( name(), overload_name(), diff --git a/aten/src/ATen/core/function_schema_inl.h b/aten/src/ATen/core/function_schema_inl.h index f4d5ee6a3fd3..b79475f72ea5 100644 --- a/aten/src/ATen/core/function_schema_inl.h +++ b/aten/src/ATen/core/function_schema_inl.h @@ -71,7 +71,11 @@ inline void FunctionSchema::checkAndNormalizeInputs( for(const auto& k : kwargs) { names.emplace_back(k.first); } +<<<<<<< HEAD throw std::runtime_error(findErrorInKwargs(names)); +======= + TORCH_CHECK(false, findErrorInKwargs(names)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } diff --git a/aten/src/ATen/core/library.cpp b/aten/src/ATen/core/library.cpp index b8a5b418bbc0..5a9b885900fd 100644 --- a/aten/src/ATen/core/library.cpp +++ b/aten/src/ATen/core/library.cpp @@ -1,6 +1,10 @@ #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace torch { @@ -11,7 +15,11 @@ namespace { #ifdef STRIP_ERROR_MESSAGES return std::string(); #else +<<<<<<< HEAD return c10::str("registered at ", file, ":", line); +======= + return fmt::format("registered at {}:{}", file, line); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif } @@ -58,6 +66,29 @@ void Library::reset() { #define ERROR_CONTEXT "(Error occurred while processing ", toString(kind_), " block at ", file_, ":", line_, ")" +<<<<<<< HEAD +======= +#if defined(TORCH_LIBRARY_THREAD_UNSAFE_LAZY_INIT) && defined(C10_MOBILE) +namespace detail { + // Insertion of library initializers into torch_library_initializers is not + // thread-safe as we expect this to be handled by the applications dynamic + // library loader, which would guarantee that only one thread is inserting + // libraries into the vector. We do require thread safety when calling + // initialize_torch_libraries however, as this can be called from any + // thread, and potentially race and corrupt the library initializer vector. 
+ std::mutex torch_library_initializer_mutex; + std::vector torch_library_initializers; +} // namespace detail +void initialize_torch_libraries() { + const std::lock_guard lock(detail::torch_library_initializer_mutex); + for (auto* initializer : detail::torch_library_initializers) { + initializer->initialize(); + } + detail::torch_library_initializers.clear(); +} +#endif + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Library::Library(Kind kind, std::string ns, std::optional k, const char* file, uint32_t line) : kind_(kind) , ns_(ns == "_" ? std::nullopt : std::make_optional(std::move(ns))) diff --git a/aten/src/ATen/core/op_registration/op_registration_test.cpp b/aten/src/ATen/core/op_registration/op_registration_test.cpp index 0ffc061870f1..415aaf18bcdd 100644 --- a/aten/src/ATen/core/op_registration/op_registration_test.cpp +++ b/aten/src/ATen/core/op_registration/op_registration_test.cpp @@ -1787,8 +1787,12 @@ TEST(NewOperatorRegistrationTest, dispatchAutogradPrecedence) { } TEST(NewOperatorRegistrationTest, throwsWhenRegisterToBackendMapsToAutogradOther) { +<<<<<<< HEAD // NOLINTNEXTLINE(cppcoreguidelines-init-variables) bool fpga_called, math_called = false; +======= + bool fpga_called = false, math_called = false; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto m = MAKE_TORCH_LIBRARY(test); m.def("fn", torch::dispatch(c10::DispatchKey::FPGA, [&](const Tensor& x) { fpga_called = true; return x; })); m.impl("fn", c10::DispatchKey::CompositeImplicitAutograd, [&](const Tensor& x) { math_called = true; return x; }); diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp index b94e3cd6bd87..0092b0d8056d 100644 --- a/aten/src/ATen/core/type.cpp +++ b/aten/src/ATen/core/type.cpp @@ -7,6 +7,10 @@ #include #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -45,9 +49,15 @@ static_assert( "getTypePtr> not returning const ref!"); TypeVerbosity type_verbosity() { +<<<<<<< HEAD static const char* c_verbosity = std::getenv("PYTORCH_JIT_TYPE_VERBOSITY"); static TypeVerbosity verbosity = c_verbosity ? static_cast(std::stoi(c_verbosity)) : TypeVerbosity::Default; +======= + static const auto c_verbosity = c10::utils::get_env("PYTORCH_JIT_TYPE_VERBOSITY"); + static TypeVerbosity verbosity = c_verbosity ? 
+ static_cast(std::stoi(c_verbosity.value())) : TypeVerbosity::Default; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return verbosity; } diff --git a/aten/src/ATen/core/type_factory.h b/aten/src/ATen/core/type_factory.h index 5b573b5c41e9..157451f7218e 100644 --- a/aten/src/ATen/core/type_factory.h +++ b/aten/src/ATen/core/type_factory.h @@ -44,7 +44,11 @@ struct TORCH_API TypeFactoryBase { c10::DynamicType::Arguments{}); } template +<<<<<<< HEAD C10_ERASE static c10::DynamicTypePtr get() { +======= + C10_ERASE static decltype(auto) get() { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return DynamicTypeTrait::getBaseType(); } static const std::unordered_map& basePythonTypes(); diff --git a/aten/src/ATen/cpu/vec/functional_base.h b/aten/src/ATen/cpu/vec/functional_base.h index 4d1d05ea8d32..a099e87de759 100644 --- a/aten/src/ATen/cpu/vec/functional_base.h +++ b/aten/src/ATen/cpu/vec/functional_base.h @@ -6,8 +6,29 @@ #include #include +<<<<<<< HEAD namespace at::vec { +======= +namespace at { +namespace detail { +// We prefer to convert through float for reduced-precision floating +// point types if we have a Vectorized specialization for float and we +// don't have one for the actual type in question. +template +struct should_prefer_converting_through_float + : std::bool_constant< + is_reduced_floating_point_v && + vec::is_vec_specialized_for_v && + !vec::is_vec_specialized_for_v> {}; + +template +constexpr auto should_prefer_converting_through_float_v = + should_prefer_converting_through_float::value; +} // namespace detail + +namespace vec { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // slow path template inline scalar_t vec_reduce_all( @@ -29,16 +50,33 @@ inline scalar_t vec_reduce_all( template struct VecReduceAllSIMD { +<<<<<<< HEAD static inline scalar_t apply(const Op& vec_fun, const Vectorized& acc_vec) { +======= + static inline scalar_t apply( + const Op& vec_fun, + const Vectorized& acc_vec) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return vec_reduce_all(vec_fun, acc_vec, Vectorized::size()); } }; +<<<<<<< HEAD #if defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) && !defined(C10_MOBILE) #if defined(CPU_CAPABILITY_AVX2) template struct VecReduceAllSIMD { static inline float apply(const Op& vec_fun, const Vectorized& acc_vec) { +======= +#if defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) && \ + !defined(C10_MOBILE) +#if defined(CPU_CAPABILITY_AVX2) +template +struct VecReduceAllSIMD { + static inline float apply( + const Op& vec_fun, + const Vectorized& acc_vec) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using Vec = Vectorized; Vec v = acc_vec; // 128-bit shuffle @@ -57,7 +95,13 @@ struct VecReduceAllSIMD { #if defined(CPU_CAPABILITY_AVX512) template struct VecReduceAllSIMD { +<<<<<<< HEAD static inline float apply(const Op& vec_fun, const Vectorized& acc_vec) { +======= + static inline float apply( + const Op& vec_fun, + const Vectorized& acc_vec) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using Vec = Vectorized; Vec v = acc_vec; // 256-bit shuffle @@ -76,6 
+120,7 @@ struct VecReduceAllSIMD { } }; #endif // defined(CPU_CAPABILITY_AVX512) +<<<<<<< HEAD #endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) && !defined(C10_MOBILE) #if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && !defined(CPU_CAPABILITY_SVE) @@ -86,26 +131,75 @@ struct VecReduceAllSIMD { Vec v = acc_vec; // 64-bit shuffle: [a1+a5, a2+a6, a3+a7, a4+a8, -, -, -, -] -> [a3+a7, a4+a8, a1+a5, a2+a6, -, -, -, -] +======= +#endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) && + // !defined(C10_MOBILE) + +#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \ + !defined(CPU_CAPABILITY_SVE) +template +struct VecReduceAllSIMD { + static inline float apply( + const Op& vec_fun, + const Vectorized& acc_vec) { + using Vec = Vectorized; + Vec v = acc_vec; + + // 64-bit shuffle: [a1+a5, a2+a6, a3+a7, a4+a8, -, -, -, -] -> [a3+a7, + // a4+a8, a1+a5, a2+a6, -, -, -, -] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) float32x4_t v1_1 = vextq_f32(v, v, 2); Vec v1 = v1_1; // [a1+a3+a5+a7, a2+a4+a6+a8, a1+a3+a5+a7, a2+a4+a6+a8, -, -, -, -] v = vec_fun(v, v1); +<<<<<<< HEAD // 32-bit shuffle: [a1+a3+a5+a7, a2+a4+a6+a8, a1+a3+a5+a7, a2+a4+a6+a8, -, -, -, -] -> [a2+a4+a6+a8, a1+a3+a5+a7, a2+a4+a6+a8, a1+a3+a5+a7, -, -, -, -] v1_1 = vrev64q_f32(v); v1 = v1_1; // [a1+a2+a3+a4+a5+a6+a7+a8, a1+a2+a3+a4+a5+a6+a7+a8, a1+a2+a3+a4+a5+a6+a7+a8, a1+a2+a3+a4+a5+a6+a7+a8, -, -, -, -] +======= + // 32-bit shuffle: [a1+a3+a5+a7, a2+a4+a6+a8, a1+a3+a5+a7, a2+a4+a6+a8, -, + // -, -, -] -> [a2+a4+a6+a8, a1+a3+a5+a7, a2+a4+a6+a8, a1+a3+a5+a7, -, -, -, + // -] + v1_1 = vrev64q_f32(v); + v1 = v1_1; + // [a1+a2+a3+a4+a5+a6+a7+a8, a1+a2+a3+a4+a5+a6+a7+a8, + // a1+a2+a3+a4+a5+a6+a7+a8, a1+a2+a3+a4+a5+a6+a7+a8, -, -, -, -] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) v = vec_fun(v, v1); return v[0]; } }; +<<<<<<< HEAD #endif // defined(__aarch64__) #if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && defined(CPU_CAPABILITY_SVE256) template struct VecReduceAllSIMD { static inline float apply(const Op& vec_fun, const Vectorized& acc_vec) { +======= + +template <> +struct VecReduceAllSIMD>> { + static inline float apply( + const std::plus>& vec_fun, + const Vectorized& acc_vec) { + return vaddvq_f32(acc_vec); + } +}; +#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) + // && !defined(CPU_CAPABILITY_SVE) + +#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \ + defined(CPU_CAPABILITY_SVE256) +template +struct VecReduceAllSIMD { + static inline float apply( + const Op& vec_fun, + const Vectorized& acc_vec) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using Vec = Vectorized; Vec v = acc_vec; // 128-bit shuffle @@ -123,6 +217,7 @@ struct VecReduceAllSIMD { return svlasta(svpfalse(), v); } }; +<<<<<<< HEAD #endif // defined(__aarch64__) @@ -134,6 +229,26 @@ inline scalar_t vec_reduce_all(const Op& vec_fun, const Vectorized& ac template , int> = 0> inline scalar_t reduce_all(const Op& vec_fun, const scalar_t* data, int64_t size) { +======= +#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) + // && defined(CPU_CAPABILITY_SVE256) + +template +inline scalar_t vec_reduce_all( + const Op& vec_fun, + const Vectorized& 
acc_vec) { + return VecReduceAllSIMD::apply(vec_fun, acc_vec); +} + +template < + typename scalar_t, + typename Op, + typename std::enable_if_t, int> = 0> +inline scalar_t reduce_all( + const Op& vec_fun, + const scalar_t* data, + int64_t size) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using Vec = vec::Vectorized; if (size < Vec::size()) return vec_reduce_all(vec_fun, Vec::loadu(data, size), size); @@ -151,16 +266,34 @@ inline scalar_t reduce_all(const Op& vec_fun, const scalar_t* data, int64_t size } // similar to reduce_all, but reduces into two outputs +<<<<<<< HEAD template , int> = 0> inline std::pair reduce2_all(const Op1& vec_fun1, const Op2& vec_fun2, const scalar_t* data, int64_t size) { +======= +template < + typename scalar_t, + typename Op1, + typename Op2, + typename std::enable_if_t, int> = 0> +inline std::pair reduce2_all( + const Op1& vec_fun1, + const Op2& vec_fun2, + const scalar_t* data, + int64_t size) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using Vec = vec::Vectorized; if (size < Vec::size()) { auto loaded_data = Vec::loadu(data, size); return std::pair( +<<<<<<< HEAD vec_reduce_all(vec_fun1, loaded_data, size), vec_reduce_all(vec_fun2, loaded_data, size)); +======= + vec_reduce_all(vec_fun1, loaded_data, size), + vec_reduce_all(vec_fun2, loaded_data, size)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } int64_t d = Vec::size(); Vec acc_vec1 = Vec::loadu(data); @@ -176,12 +309,23 @@ inline std::pair reduce2_all(const Op1& vec_fun1, const Op2& acc_vec2 = Vec::set(acc_vec2, vec_fun2(acc_vec2, data_vec), size - d); } return std::pair( +<<<<<<< HEAD vec_reduce_all(vec_fun1, acc_vec1), vec_reduce_all(vec_fun2, acc_vec2)); } template , int> = 0> +======= + vec_reduce_all(vec_fun1, acc_vec1), vec_reduce_all(vec_fun2, acc_vec2)); +} + +template < + typename scalar_t, + typename MapOp, + typename ReduceOp, + typename std::enable_if_t, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline scalar_t map_reduce_all( const MapOp& map_fun, const ReduceOp& red_fun, @@ -205,8 +349,16 @@ inline scalar_t map_reduce_all( return vec_reduce_all(red_fun, acc_vec); } +<<<<<<< HEAD template , int> = 0> +======= +template < + typename scalar_t, + typename MapOp, + typename ReduceOp, + typename std::enable_if_t, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline scalar_t map2_reduce_all( const MapOp& map_fun, const ReduceOp& red_fun, @@ -237,8 +389,16 @@ inline scalar_t map2_reduce_all( return vec_reduce_all(red_fun, acc_vec); } +<<<<<<< HEAD template , int> = 0> +======= +template < + typename scalar_t, + typename MapOp, + typename ReduceOp, + typename std::enable_if_t, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline scalar_t map3_reduce_all( const MapOp& map_fun, const ReduceOp& red_fun, @@ -274,8 +434,18 @@ inline scalar_t map3_reduce_all( return vec_reduce_all(red_fun, acc_vec); } +<<<<<<< HEAD template , int> = 0> +======= +template < + typename scalar_t, + typename Op, + typename std::enable_if_t< + !detail::should_prefer_converting_through_float_v && + 
std::is_invocable_v>, + int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline void map( const Op& vec_fun, scalar_t* output_data, @@ -293,8 +463,21 @@ inline void map( } } +<<<<<<< HEAD template , int> = 0> +======= +template < + typename scalar_t, + typename Op, + typename std::enable_if_t< + !detail::should_prefer_converting_through_float_v && + std::is_invocable_v< + Op, + vec::Vectorized, + vec::Vectorized>, + int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline void map2( const Op& vec_fun, scalar_t* output_data, @@ -317,8 +500,22 @@ inline void map2( } } +<<<<<<< HEAD template , int> = 0> +======= +template < + typename scalar_t, + typename Op, + typename std::enable_if_t< + !detail::should_prefer_converting_through_float_v && + std::is_invocable_v< + Op, + vec::Vectorized, + vec::Vectorized, + vec::Vectorized>, + int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline void map3( const Op& vec_fun, scalar_t* output_data, @@ -344,8 +541,23 @@ inline void map3( } } +<<<<<<< HEAD template , int> = 0> +======= +template < + typename scalar_t, + typename Op, + typename std::enable_if_t< + !detail::should_prefer_converting_through_float_v && + std::is_invocable_v< + Op, + vec::Vectorized, + vec::Vectorized, + vec::Vectorized, + vec::Vectorized>, + int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline void map4( const Op& vec_fun, scalar_t* output_data, @@ -374,4 +586,9 @@ inline void map4( } } +<<<<<<< HEAD } // namespace at::vec +======= +} // namespace vec +} // namespace at +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/functional_bfloat16.h b/aten/src/ATen/cpu/vec/functional_bfloat16.h index 3bd22b3820f0..7c13df08320c 100644 --- a/aten/src/ATen/cpu/vec/functional_bfloat16.h +++ b/aten/src/ATen/cpu/vec/functional_bfloat16.h @@ -6,27 +6,50 @@ #include namespace at::vec { +<<<<<<< HEAD // BFloat16 specification template struct VecScalarType { using type = scalar_t; }; template <> struct VecScalarType { using type = float; }; template <> struct VecScalarType { using type = float; }; +======= +// BFloat16 specification +template +struct VecScalarType { + using type = scalar_t; +}; +template <> +struct VecScalarType { + using type = float; +}; +template <> +struct VecScalarType { + using type = float; +}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // This is different from at::acc_type since we only need to specialize BFloat16 template using vec_scalar_t = typename VecScalarType::type; // Vector conversion between float and bfloat16/half +<<<<<<< HEAD template , int> = 0> inline std::tuple, Vectorized> convert_to_float(const Vectorized&); template <> inline std::tuple, Vectorized> convert_to_float (const Vectorized& a) { +======= +template <> +inline std::tuple, Vectorized> convert_to_float< + BFloat16>(const Vectorized& a) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return convert_bfloat16_float(a); } template <> +<<<<<<< HEAD inline std::tuple, Vectorized> 
convert_to_float (const Vectorized& a) { return convert_half_float(a); } @@ -37,10 +60,22 @@ inline Vectorized convert_from_float(const Vectorized&, const V template <> inline Vectorized convert_from_float(const Vectorized& a, const Vectorized& b) { +======= +inline std::tuple, Vectorized> convert_to_float( + const Vectorized& a) { + return convert_half_float(a); +} + +template <> +inline Vectorized convert_from_float( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return convert_float_bfloat16(a, b); } template <> +<<<<<<< HEAD inline Vectorized convert_from_float(const Vectorized& a, const Vectorized& b) { return convert_float_half(a, b); } @@ -51,10 +86,32 @@ inline void load_to_float(const scalar_t *data, Vectorized &out1, Vectori template <> inline void load_to_float (const BFloat16 *data, Vectorized &out1, Vectorized &out2) { +======= +inline Vectorized convert_from_float( + const Vectorized& a, + const Vectorized& b) { + return convert_float_half(a, b); +} + +template < + typename scalar_t, + typename std::enable_if_t, int> = 0> +inline void load_to_float( + const scalar_t* data, + Vectorized& out1, + Vectorized& out2); + +template <> +inline void load_to_float( + const BFloat16* data, + Vectorized& out1, + Vectorized& out2) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) load_fp32_from_bf16(data, out1, out2); } template <> +<<<<<<< HEAD inline void load_to_float (const Half *data, Vectorized &out1, Vectorized &out2) { load_fp32_from_fp16(data, out1, out2); } @@ -65,21 +122,49 @@ inline void load_to_float(const scalar_t *data, Vectorized &out); template <> inline void load_to_float (const BFloat16 *data, Vectorized &out) { +======= +inline void load_to_float( + const Half* data, + Vectorized& out1, + Vectorized& out2) { + load_fp32_from_fp16(data, out1, out2); +} + +template < + typename scalar_t, + typename std::enable_if_t, int> = 0> +inline void load_to_float(const scalar_t* data, Vectorized& out); + +template <> +inline void load_to_float( + const BFloat16* data, + Vectorized& out) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) load_fp32_from_bf16(data, out); } template <> +<<<<<<< HEAD inline void load_to_float (const Half *data, Vectorized &out) { load_fp32_from_fp16(data, out); } // Note that we already have specialized member of Vectorized for BFloat16 // so the following functions would run smoothly: +======= +inline void load_to_float(const Half* data, Vectorized& out) { + load_fp32_from_fp16(data, out); +} + +// Note that we already have specialized member of Vectorized for +// BFloat16 so the following functions would run smoothly: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // using Vec = Vectorized; // Vec one = Vec(BFloat16(1)); // vec::map([](Vec x) { return one / (one + x.exp()); }, y_ptr, x_ptr, N); // // Then why we still need to specialize "functional"? +<<<<<<< HEAD // If we do specialization at Vectorized<> level, the above example would need 3 pairs of // conversion of bf16->fp32/fp32->bf16, each for ".exp()", "+" and "/". 
// If we do specialization at vec::map<>() level, we have only 1 pair of conversion @@ -88,6 +173,17 @@ inline void load_to_float (const Half *data, Vectorized &out) { // The following BFloat16 functionality will only do data type conversion for input // and output vector (reduce functionality will only convert the final scalar back to bf16). // Compared to Vectorized<> specialization, +======= +// If we do specialization at Vectorized<> level, the above example would need +// 3 pairs of conversion of bf16->fp32/fp32->bf16, each for ".exp()", "+" and +// "/". If we do specialization at vec::map<>() level, we have only 1 pair of +// conversion of bf16->fp32/fp32->bf16, for the input and output BFloat16 +// vector only. +// +// The following BFloat16 functionality will only do data type conversion for +// input and output vector (reduce functionality will only convert the final +// scalar back to bf16). Compared to Vectorized<> specialization, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // 1. better performance since we have less data type conversion; // 2. less rounding error since immediate results are kept in fp32; // 3. accumulation done on data type of fp32. @@ -95,8 +191,15 @@ inline void load_to_float (const Half *data, Vectorized &out) { // If you plan to extend this file, please ensure adding unit tests at // aten/src/ATen/test/vec_test_all_types.cpp // +<<<<<<< HEAD template , int> = 0> +======= +template < + typename scalar_t, + typename Op, + typename std::enable_if_t, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline float reduce_all(const Op& vec_fun, const scalar_t* data, int64_t size) { using bVec = vec::Vectorized; using fVec = vec::Vectorized; @@ -104,7 +207,12 @@ inline float reduce_all(const Op& vec_fun, const scalar_t* data, int64_t size) { bVec data_bvec = bVec::loadu(data, size); auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); if (size > fVec::size()) { +<<<<<<< HEAD data_fvec0 = fVec::set(data_fvec0, vec_fun(data_fvec0, data_fvec1), size - fVec::size()); +======= + data_fvec0 = fVec::set( + data_fvec0, vec_fun(data_fvec0, data_fvec1), size - fVec::size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return vec_reduce_all(vec_fun, data_fvec0, fVec::size()); } else { return vec_reduce_all(vec_fun, data_fvec0, size); @@ -124,27 +232,55 @@ inline float reduce_all(const Op& vec_fun, const scalar_t* data, int64_t size) { auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); if (size - d > fVec::size()) { acc_fvec0 = vec_fun(acc_fvec0, data_fvec0); +<<<<<<< HEAD acc_fvec1 = fVec::set(acc_fvec1, vec_fun(acc_fvec1, data_fvec1), size - d - fVec::size()); } else { acc_fvec0 = fVec::set(acc_fvec0, vec_fun(acc_fvec0, data_fvec0), size - d); +======= + acc_fvec1 = fVec::set( + acc_fvec1, vec_fun(acc_fvec1, data_fvec1), size - d - fVec::size()); + } else { + acc_fvec0 = + fVec::set(acc_fvec0, vec_fun(acc_fvec0, data_fvec0), size - d); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } acc_fvec0 = vec_fun(acc_fvec0, acc_fvec1); return vec_reduce_all(vec_fun, acc_fvec0); } +<<<<<<< HEAD template , int> = 0> inline std::pair reduce2_all(const Op1& vec_fun1, const Op2& vec_fun2, const scalar_t* data, int64_t size) { +======= 
+template < + typename scalar_t, + typename Op1, + typename Op2, + typename std::enable_if_t, int> = 0> +inline std::pair reduce2_all( + const Op1& vec_fun1, + const Op2& vec_fun2, + const scalar_t* data, + int64_t size) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using bVec = vec::Vectorized; using fVec = vec::Vectorized; if (size < bVec::size()) { bVec data_bvec = bVec::loadu(data, size); auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); if (size > fVec::size()) { +<<<<<<< HEAD fVec acc1_fvec = fVec::set(data_fvec0, vec_fun1(data_fvec0, data_fvec1), size - fVec::size()); fVec acc2_fvec = fVec::set(data_fvec0, vec_fun2(data_fvec0, data_fvec1), size - fVec::size()); +======= + fVec acc1_fvec = fVec::set( + data_fvec0, vec_fun1(data_fvec0, data_fvec1), size - fVec::size()); + fVec acc2_fvec = fVec::set( + data_fvec0, vec_fun2(data_fvec0, data_fvec1), size - fVec::size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return std::pair( vec_reduce_all(vec_fun1, acc1_fvec, fVec::size()), vec_reduce_all(vec_fun2, acc2_fvec, fVec::size())); @@ -171,12 +307,29 @@ inline std::pair reduce2_all(const Op1& vec_fun1, const Op2& vec_f auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); if (size - d > fVec::size()) { acc1_fvec0 = vec_fun1(acc1_fvec0, data_fvec0); +<<<<<<< HEAD acc1_fvec1 = fVec::set(acc1_fvec1, vec_fun1(acc1_fvec1, data_fvec1), size - d - fVec::size()); acc2_fvec0 = vec_fun2(acc2_fvec0, data_fvec0); acc2_fvec1 = fVec::set(acc2_fvec1, vec_fun2(acc2_fvec1, data_fvec1), size - d - fVec::size()); } else { acc1_fvec0 = fVec::set(acc1_fvec0, vec_fun1(acc1_fvec0, data_fvec0), size - d); acc2_fvec0 = fVec::set(acc2_fvec0, vec_fun2(acc2_fvec0, data_fvec0), size - d); +======= + acc1_fvec1 = fVec::set( + acc1_fvec1, + vec_fun1(acc1_fvec1, data_fvec1), + size - d - fVec::size()); + acc2_fvec0 = vec_fun2(acc2_fvec0, data_fvec0); + acc2_fvec1 = fVec::set( + acc2_fvec1, + vec_fun2(acc2_fvec1, data_fvec1), + size - d - fVec::size()); + } else { + acc1_fvec0 = + fVec::set(acc1_fvec0, vec_fun1(acc1_fvec0, data_fvec0), size - d); + acc2_fvec0 = + fVec::set(acc2_fvec0, vec_fun2(acc2_fvec0, data_fvec0), size - d); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } acc1_fvec0 = vec_fun1(acc1_fvec0, acc1_fvec1); @@ -186,8 +339,16 @@ inline std::pair reduce2_all(const Op1& vec_fun1, const Op2& vec_f vec_reduce_all(vec_fun2, acc2_fvec0)); } +<<<<<<< HEAD template , int> = 0> +======= +template < + typename scalar_t, + typename MapOp, + typename ReduceOp, + typename std::enable_if_t, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline float map_reduce_all( const MapOp& map_fun, const ReduceOp& red_fun, @@ -201,7 +362,12 @@ inline float map_reduce_all( if (size > fVec::size()) { data_fvec0 = map_fun(data_fvec0); data_fvec1 = map_fun(data_fvec1); +<<<<<<< HEAD data_fvec0 = fVec::set(data_fvec0, red_fun(data_fvec0, data_fvec1), size - fVec::size()); +======= + data_fvec0 = fVec::set( + data_fvec0, red_fun(data_fvec0, data_fvec1), size - fVec::size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return vec_reduce_all(red_fun, data_fvec0, fVec::size()); } else { data_fvec0 = 
map_fun(data_fvec0); @@ -228,18 +394,35 @@ inline float map_reduce_all( data_fvec0 = map_fun(data_fvec0); data_fvec1 = map_fun(data_fvec1); acc_fvec0 = red_fun(acc_fvec0, data_fvec0); +<<<<<<< HEAD acc_fvec1 = fVec::set(acc_fvec1, red_fun(acc_fvec1, data_fvec1), size - d - fVec::size()); } else { data_fvec0 = map_fun(data_fvec0); acc_fvec0 = fVec::set(acc_fvec0, red_fun(acc_fvec0, data_fvec0), size - d); +======= + acc_fvec1 = fVec::set( + acc_fvec1, red_fun(acc_fvec1, data_fvec1), size - d - fVec::size()); + } else { + data_fvec0 = map_fun(data_fvec0); + acc_fvec0 = + fVec::set(acc_fvec0, red_fun(acc_fvec0, data_fvec0), size - d); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } acc_fvec0 = red_fun(acc_fvec0, acc_fvec1); return vec_reduce_all(red_fun, acc_fvec0); } +<<<<<<< HEAD template , int> = 0> +======= +template < + typename scalar_t, + typename MapOp, + typename ReduceOp, + typename std::enable_if_t, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline float map2_reduce_all( const MapOp& map_fun, const ReduceOp& red_fun, @@ -256,7 +439,12 @@ inline float map2_reduce_all( if (size > fVec::size()) { data_fvec0 = map_fun(data_fvec0, data2_fvec0); data_fvec1 = map_fun(data_fvec1, data2_fvec1); +<<<<<<< HEAD data_fvec0 = fVec::set(data_fvec0, red_fun(data_fvec0, data_fvec1), size - fVec::size()); +======= + data_fvec0 = fVec::set( + data_fvec0, red_fun(data_fvec0, data_fvec1), size - fVec::size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return vec_reduce_all(red_fun, data_fvec0, fVec::size()); } else { data_fvec0 = map_fun(data_fvec0, data2_fvec0); @@ -289,18 +477,35 @@ inline float map2_reduce_all( data_fvec0 = map_fun(data_fvec0, data2_fvec0); data_fvec1 = map_fun(data_fvec1, data2_fvec1); acc_fvec0 = red_fun(acc_fvec0, data_fvec0); +<<<<<<< HEAD acc_fvec1 = fVec::set(acc_fvec1, red_fun(acc_fvec1, data_fvec1), size - d - fVec::size()); } else { data_fvec0 = map_fun(data_fvec0, data2_fvec0); acc_fvec0 = fVec::set(acc_fvec0, red_fun(acc_fvec0, data_fvec0), size - d); +======= + acc_fvec1 = fVec::set( + acc_fvec1, red_fun(acc_fvec1, data_fvec1), size - d - fVec::size()); + } else { + data_fvec0 = map_fun(data_fvec0, data2_fvec0); + acc_fvec0 = + fVec::set(acc_fvec0, red_fun(acc_fvec0, data_fvec0), size - d); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } acc_fvec0 = red_fun(acc_fvec0, acc_fvec1); return vec_reduce_all(red_fun, acc_fvec0); } +<<<<<<< HEAD template , int> = 0> +======= +template < + typename scalar_t, + typename MapOp, + typename ReduceOp, + typename std::enable_if_t, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline float map3_reduce_all( const MapOp& map_fun, const ReduceOp& red_fun, @@ -320,7 +525,12 @@ inline float map3_reduce_all( if (size > fVec::size()) { data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0); data_fvec1 = map_fun(data_fvec1, data2_fvec1, data3_fvec1); +<<<<<<< HEAD data_fvec0 = fVec::set(data_fvec0, red_fun(data_fvec0, data_fvec1), size - fVec::size()); +======= + data_fvec0 = fVec::set( + data_fvec0, red_fun(data_fvec0, data_fvec1), size - fVec::size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise 
broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return vec_reduce_all(red_fun, data_fvec0, fVec::size()); } else { data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0); @@ -359,18 +569,37 @@ inline float map3_reduce_all( data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0); data_fvec1 = map_fun(data_fvec1, data2_fvec1, data3_fvec1); acc_fvec0 = red_fun(acc_fvec0, data_fvec0); +<<<<<<< HEAD acc_fvec1 = fVec::set(acc_fvec1, red_fun(acc_fvec1, data_fvec1), size - d - fVec::size()); } else { data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0); acc_fvec0 = fVec::set(acc_fvec0, red_fun(acc_fvec0, data_fvec0), size - d); +======= + acc_fvec1 = fVec::set( + acc_fvec1, red_fun(acc_fvec1, data_fvec1), size - d - fVec::size()); + } else { + data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0); + acc_fvec0 = + fVec::set(acc_fvec0, red_fun(acc_fvec0, data_fvec0), size - d); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } acc_fvec0 = red_fun(acc_fvec0, acc_fvec1); return vec_reduce_all(red_fun, acc_fvec0); } +<<<<<<< HEAD template , int> = 0> +======= +template < + typename scalar_t, + typename Op, + typename std::enable_if_t< + !(!detail::should_prefer_converting_through_float_v && + std::is_invocable_v>), + int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline void map( const Op& vec_fun, scalar_t* output_data, @@ -397,8 +626,15 @@ inline void map( } } +<<<<<<< HEAD template , int> = 0> +======= +template < + typename scalar_t, + typename Op, + typename std::enable_if_t, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline void map( const Op& vec_fun, scalar_t* output_data, @@ -419,7 +655,12 @@ inline void map( fVec data_fvec0, data_fvec1; if (size - d > fVec::size()) { data_fvec0 = fVec::loadu(input_data + d); +<<<<<<< HEAD data_fvec1 = fVec::loadu(input_data + d + fVec::size(), size - d - fVec::size()); +======= + data_fvec1 = + fVec::loadu(input_data + d + fVec::size(), size - d - fVec::size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { // choose to align with behaviour of bVec::loadu(ptr, size), // which leaves data_fvec1 uninitialized @@ -432,8 +673,21 @@ inline void map( } } +<<<<<<< HEAD template , int> = 0> +======= +template < + typename scalar_t, + typename Op, + typename std::enable_if_t< + !(!detail::should_prefer_converting_through_float_v && + std::is_invocable_v< + Op, + vec::Vectorized, + vec::Vectorized>), + int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline void map2( const Op& vec_fun, scalar_t* output_data, @@ -465,8 +719,22 @@ inline void map2( } } +<<<<<<< HEAD template , int> = 0> +======= +template < + typename scalar_t, + typename Op, + typename std::enable_if_t< + !(!detail::should_prefer_converting_through_float_v && + std::is_invocable_v< + Op, + vec::Vectorized, + vec::Vectorized, + vec::Vectorized>), + int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline void map3( const Op& vec_fun, scalar_t* output_data, @@ -503,8 +771,23 @@ inline void map3( } } +<<<<<<< HEAD template , int> = 0> +======= 
+template < + typename scalar_t, + typename Op, + typename std::enable_if_t< + !(!detail::should_prefer_converting_through_float_v && + std::is_invocable_v< + Op, + vec::Vectorized, + vec::Vectorized, + vec::Vectorized, + vec::Vectorized>), + int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline void map4( const Op& vec_fun, scalar_t* output_data, @@ -525,8 +808,15 @@ inline void map4( auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec); bVec data4_bvec = bVec::loadu(input_data4 + d); auto [data4_fvec0, data4_fvec1] = convert_to_float(data4_bvec); +<<<<<<< HEAD fVec output_fvec0 = vec_fun(data1_fvec0, data2_fvec0, data3_fvec0, data4_fvec0); fVec output_fvec1 = vec_fun(data1_fvec1, data2_fvec1, data3_fvec1, data4_fvec1); +======= + fVec output_fvec0 = + vec_fun(data1_fvec0, data2_fvec0, data3_fvec0, data4_fvec0); + fVec output_fvec1 = + vec_fun(data1_fvec1, data2_fvec1, data3_fvec1, data4_fvec1); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); output_bvec.store(output_data + d); } @@ -539,8 +829,15 @@ inline void map4( auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec); bVec data4_bvec = bVec::loadu(input_data4 + d, size - d); auto [data4_fvec0, data4_fvec1] = convert_to_float(data4_bvec); +<<<<<<< HEAD fVec output_fvec0 = vec_fun(data1_fvec0, data2_fvec0, data3_fvec0, data4_fvec0); fVec output_fvec1 = vec_fun(data1_fvec1, data2_fvec1, data3_fvec1, data4_fvec1); +======= + fVec output_fvec0 = + vec_fun(data1_fvec0, data2_fvec0, data3_fvec0, data4_fvec0); + fVec output_fvec1 = + vec_fun(data1_fvec1, data2_fvec1, data3_fvec1, data4_fvec1); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); output_bvec.store(output_data + d, size - d); } diff --git a/aten/src/ATen/cpu/vec/intrinsics.h b/aten/src/ATen/cpu/vec/intrinsics.h index 48b18793b079..fd7d793e846b 100644 --- a/aten/src/ATen/cpu/vec/intrinsics.h +++ b/aten/src/ATen/cpu/vec/intrinsics.h @@ -13,10 +13,21 @@ /* Microsoft C/C++-compatible compiler */ #include #if _MSC_VER <= 1900 +<<<<<<< HEAD #define _mm256_extract_epi64(X, Y) (_mm_extract_epi64(_mm256_extractf128_si256(X, Y >> 1), Y % 2)) #define _mm256_extract_epi32(X, Y) (_mm_extract_epi32(_mm256_extractf128_si256(X, Y >> 2), Y % 4)) #define _mm256_extract_epi16(X, Y) (_mm_extract_epi16(_mm256_extractf128_si256(X, Y >> 3), Y % 8)) #define _mm256_extract_epi8(X, Y) (_mm_extract_epi8(_mm256_extractf128_si256(X, Y >> 4), Y % 16)) +======= +#define _mm256_extract_epi64(X, Y) \ + (_mm_extract_epi64(_mm256_extractf128_si256(X, Y >> 1), Y % 2)) +#define _mm256_extract_epi32(X, Y) \ + (_mm_extract_epi32(_mm256_extractf128_si256(X, Y >> 2), Y % 4)) +#define _mm256_extract_epi16(X, Y) \ + (_mm_extract_epi16(_mm256_extractf128_si256(X, Y >> 3), Y % 8)) +#define _mm256_extract_epi8(X, Y) \ + (_mm_extract_epi8(_mm256_extractf128_si256(X, Y >> 4), Y % 16)) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif #elif defined(__GNUC__) && (defined(__ARM_NEON__) || defined(__aarch64__)) /* GCC-compatible compiler, targeting ARM with NEON */ @@ -25,9 +36,15 @@ /* GCC-compatible compiler, targeting ARM with SVE */ #include #endif 
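// [Editorial aside, not part of the patch] The _MSC_VER <= 1900 fallback
// macros reformatted earlier in this hunk emulate the missing 256-bit
// extract intrinsics by first pulling out the relevant 128-bit half and
// then extracting within it. For example, with a compile-time lane index
// of 2:
//
//   __m256i v = /* some 256-bit vector */;
//   __m128i hi   = _mm256_extractf128_si256(v, 2 >> 1); // upper 128-bit half
//   int64_t lane = _mm_extract_epi64(hi, 2 % 2);        // element 0 of that half
//
// which is exactly what _mm256_extract_epi64(v, 2) expands to under the
// macro definition above.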
+<<<<<<< HEAD #if defined (MISSING_ARM_VLD1) #include #elif defined (MISSING_ARM_VST1) +======= +#if defined(MISSING_ARM_VLD1) +#include +#elif defined(MISSING_ARM_VST1) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #endif #elif defined(__GNUC__) && defined(__IWMMXT__) @@ -36,8 +53,13 @@ #elif defined(__s390x__) // targets Z/architecture // we will include vecintrin later +<<<<<<< HEAD #elif (defined(__GNUC__) || defined(__xlC__)) && \ (defined(__VEC__) || defined(__ALTIVEC__)) +======= +#elif (defined(__GNUC__) || defined(__xlC__)) && \ + (defined(__VEC__) || defined(__ALTIVEC__)) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */ #include /* We need to undef those tokens defined by to avoid conflicts diff --git a/aten/src/ATen/cpu/vec/sve/sve_helper.h b/aten/src/ATen/cpu/vec/sve/sve_helper.h index e511ebb52b2e..3005f77aed0a 100644 --- a/aten/src/ATen/cpu/vec/sve/sve_helper.h +++ b/aten/src/ATen/cpu/vec/sve/sve_helper.h @@ -7,6 +7,7 @@ #if defined(CPU_CAPABILITY_SVE) // Define the data type of VLS(vector-length specific). +<<<<<<< HEAD typedef svbool_t vls_pred_t __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); typedef svint8_t vls_int8_t __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); typedef svint16_t vls_int16_t __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); @@ -19,6 +20,34 @@ typedef svuint64_t vls_uint64_t __attribute__((arm_sve_vector_bits(VECTOR_WIDTH typedef svfloat16_t vls_float16_t __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); typedef svfloat32_t vls_float32_t __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); typedef svfloat64_t vls_float64_t __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +======= +typedef svbool_t vls_pred_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svint8_t vls_int8_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svint16_t vls_int16_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svint32_t vls_int32_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svint64_t vls_int64_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svuint8_t vls_uint8_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svuint16_t vls_uint16_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svuint32_t vls_uint32_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svuint64_t vls_uint64_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svfloat16_t vls_float16_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svbfloat16_t vls_bfloat16_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svfloat32_t vls_float32_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svfloat64_t vls_float64_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define ptrue svptrue_b8() #define ZERO_S8 svdup_n_s8(0) @@ -32,7 +61,11 @@ typedef svfloat64_t vls_float64_t __attribute__((arm_sve_vector_bits(VECTOR_WIDT #define ZERO_F16 svdup_n_f16(0.f) #define ZERO_F32 svdup_n_f32(0.f) #define ZERO_F64 svdup_n_f64(0.0) +<<<<<<< HEAD #define ONE_S8 svdup_n_s8(1) +======= +#define ONE_S8 
svdup_n_s8(1) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define ONE_S16 svdup_n_s16(1) #define ONE_S32 svdup_n_s32(1) #define ONE_S64 svdup_n_s64(1) @@ -41,6 +74,10 @@ typedef svfloat64_t vls_float64_t __attribute__((arm_sve_vector_bits(VECTOR_WIDT #define ONE_U32 svdup_n_u32(1) #define ONE_U64 svdup_n_u64(1) #define ONE_F16 svdup_n_f16(1.f) +<<<<<<< HEAD +======= +#define ONE_BF16 svdup_n_bf16(1.f) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define ONE_F32 svdup_n_f32(1.f) #define ONE_F64 svdup_n_f64(1.0) #define ALL_S8_TRUE_MASK svdup_n_s8(0xff) @@ -55,6 +92,11 @@ typedef svfloat64_t vls_float64_t __attribute__((arm_sve_vector_bits(VECTOR_WIDT #define ALL_U8_FALSE_MASK svdup_n_u8(0x00) #define ALL_F16_TRUE_MASK svreinterpret_f16_s16(ALL_S16_TRUE_MASK) #define ALL_F16_FALSE_MASK svreinterpret_f16_s16(ALL_S16_FALSE_MASK) +<<<<<<< HEAD +======= +#define ALL_BF16_TRUE_MASK svreinterpret_bf16_s16(ALL_S16_TRUE_MASK) +#define ALL_BF16_FALSE_MASK svreinterpret_bf16_s16(ALL_S16_FALSE_MASK) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define ALL_F32_TRUE_MASK svreinterpret_f32_s32(ALL_S32_TRUE_MASK) #define ALL_F32_FALSE_MASK svreinterpret_f32_s32(ALL_S32_FALSE_MASK) #define ALL_F64_TRUE_MASK svreinterpret_f64_s64(ALL_S64_TRUE_MASK) diff --git a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h new file mode 100644 index 000000000000..7f05c2ad166f --- /dev/null +++ b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h @@ -0,0 +1,580 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +namespace at { +namespace vec { +// Note [CPU_CAPABILITY namespace] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// This header, and all of its subheaders, will be compiled with +// different architecture flags for each supported set of vector +// intrinsics. So we need to make sure they aren't inadvertently +// linked together. We do this by declaring objects in an `inline +// namespace` which changes the name mangling, but can still be +// accessed as `at::vec`. +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_SVE256) && defined(__ARM_FEATURE_BF16) + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + vls_bfloat16_t values; + + public: + using value_type = BFloat16; + using size_type = int; + + static constexpr size_type size() { + return VECTOR_WIDTH / sizeof(BFloat16); + } + + Vectorized() {} + Vectorized(svbfloat16_t v) : values(v) {} + Vectorized(int val); + Vectorized(BFloat16 val); + + template < + typename... Args, + typename = std::enable_if_t<(sizeof...(Args) == size())>> + Vectorized(Args... 
vals) { + __at_align__ BFloat16 buffer[size()] = {vals...}; + values = svld1_bf16(ptrue, reinterpret_cast(buffer)); + } + + operator svbfloat16_t() const { + return values; + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask_) { + svbool_t mask = + svcmpeq_s16(ptrue, svreinterpret_s16_bf16(mask_), ALL_S16_TRUE_MASK); + return svsel_bf16(mask, b, a); + } + template + static Vectorized arange( + BFloat16 base = 0.f, + step_t step = static_cast(1)) { + __at_align__ BFloat16 buffer[size()]; + for (int64_t i = 0; i < size(); i++) { + buffer[i] = base + i * step; + } + return svld1_bf16(ptrue, reinterpret_cast(buffer)); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { + if (count == 0) { + return a; + } else if (count < size()) { + return svsel_bf16(svwhilelt_b16(0ull, count), b, a); + } + return b; + } + static Vectorized loadu(const void* ptr, int64_t count = size()) { + if (count == size()) + return svld1_bf16(ptrue, reinterpret_cast(ptr)); + svbool_t pg = svwhilelt_b16(0ull, count); + return svld1_bf16(pg, reinterpret_cast(ptr)); + } + void store(void* ptr, int64_t count = size()) const { + __at_align__ bfloat16_t tmp[size()]; + std::memset(tmp, 0, sizeof(tmp)); + if (count == size()) { + svst1_bf16(ptrue, reinterpret_cast(tmp), values); + } else { + svbool_t pg = svwhilelt_b16(0ull, count); + svst1_bf16(pg, reinterpret_cast(tmp), values); + } + std::memcpy( + reinterpret_cast(ptr), + reinterpret_cast(tmp), + count * sizeof(bfloat16_t)); + } + const BFloat16& operator[](int idx) const = delete; + BFloat16& operator[](int idx) = delete; + int64_t zero_mask() const { + int64_t mask = 0; + // returns an integer mask where all zero elements are translated to + // 1-bit and others are translated to 0-bit int64_t mask = 0; + __at_align__ int16_t mask_array[size()]; + + svbool_t svbool_mask = + svcmpeq_f16(ptrue, svreinterpret_f16_bf16(values), ZERO_F16); + svst1_s16( + ptrue, + mask_array, + svsel_s16(svbool_mask, ALL_S16_TRUE_MASK, ALL_S16_FALSE_MASK)); + for (int64_t i = 0; i < size(); ++i) { + if (mask_array[i]) + mask |= (1ull << i); + } + return mask; + } + Vectorized isnan() const; + bool has_inf_nan() const; + Vectorized map(BFloat16 (*f)(BFloat16)) const { + __at_align__ BFloat16 tmp[size()]; + store(tmp); + for (int64_t i = 0; i < size(); ++i) { + tmp[i] = f(tmp[i]); + } + return loadu(tmp); + } + Vectorized abs() const { + auto mask = svdup_n_u16(0x7FFF); + auto vals = svreinterpret_u16_bf16(values); + vals = svand_u16_x(ptrue, vals, mask); + return svreinterpret_bf16_u16(vals); + } + Vectorized angle() const; + Vectorized real() const { + return values; + } + Vectorized imag() const { + return Vectorized(0.f); + } + Vectorized conj() const { + return values; + } + Vectorized acos() const; + Vectorized acosh() const; + Vectorized asin() const; + Vectorized atan() const; + Vectorized atanh() const; + Vectorized atan2(const Vectorized& b) const; + Vectorized copysign(const Vectorized& sign) const; + Vectorized erf() const; + Vectorized erfc() const; + Vectorized erfinv() const; + Vectorized exp() const; + Vectorized exp2() const; + Vectorized expm1() const; + Vectorized exp_u20() const { + return exp(); + } + Vectorized fmod(const Vectorized& q) const; + Vectorized hypot(const Vectorized& b) const; + Vectorized i0() const; + Vectorized i0e() const; + Vectorized digamma() const; + Vectorized igamma(const Vectorized& x) const; + Vectorized igammac(const Vectorized& x) const; + Vectorized 
nextafter(const Vectorized& b) const; + Vectorized log() const; + Vectorized log2() const; + Vectorized log10() const; + Vectorized log1p() const; + Vectorized frac() const; + Vectorized sin() const; + Vectorized sinh() const; + Vectorized cos() const; + Vectorized cosh() const; + Vectorized ceil() const; + Vectorized floor() const; + Vectorized neg() const { + auto mask = svdup_n_u16(0x8000); + auto vals = svreinterpret_u16_bf16(values); + vals = sveor_u16_x(ptrue, vals, mask); + return svreinterpret_bf16_u16(vals); + }; + Vectorized round() const; + Vectorized tan() const; + Vectorized tanh() const; + Vectorized trunc() const; + Vectorized lgamma() const; + Vectorized sqrt() const; + Vectorized reciprocal() const; + Vectorized rsqrt() const; + Vectorized pow(const Vectorized& b) const; + // Comparison using the _CMP_**_OQ predicate. + // `O`: get false if an operand is NaN + // `Q`: do not raise if an operand is NaN + Vectorized operator==(const Vectorized& other) const; + + Vectorized operator!=(const Vectorized& other) const; + + Vectorized operator<(const Vectorized& other) const; + + Vectorized operator<=(const Vectorized& other) const; + + Vectorized operator>(const Vectorized& other) const; + + Vectorized operator>=(const Vectorized& other) const; + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; +}; + +inline std::tuple, Vectorized> convert_bfloat16_float( + const Vectorized& a) { + static_assert( + Vectorized::size() == 2 * Vectorized::size()); + auto zero = svreinterpret_bf16_f32(svdup_n_f32(0.0f)); + auto bf16_vec1 = svzip1_bf16(zero, a); + auto bf16_vec2 = svzip2_bf16(zero, a); + auto x1 = svreinterpret_f32_bf16(bf16_vec1); + auto x2 = svreinterpret_f32_bf16(bf16_vec2); + return {Vectorized(x1), Vectorized(x2)}; +} + +inline Vectorized convert_float_bfloat16( + const Vectorized& a, + const Vectorized& b) { + static_assert( + Vectorized::size() == 2 * Vectorized::size()); + svbfloat16_t x1 = svcvt_bf16_f32_z(ptrue, a); + svbfloat16_t x2 = svcvt_bf16_f32_z(ptrue, b); + return Vectorized(svuzp1_bf16(x1, x2)); +} + +inline void load_fp32_from_bf16(const BFloat16* data, Vectorized& out) { + __at_align__ float values[Vectorized::size()]; + for (const auto k : c10::irange(Vectorized::size())) { + values[k] = data[k]; + } + out = Vectorized::loadu(values); +} + +inline void load_fp32_from_bf16( + const BFloat16* data, + Vectorized& out1, + Vectorized& out2) { + Vectorized bf16_vec = Vectorized::loadu(data); + auto floats = convert_bfloat16_float(bf16_vec); + out1 = std::get<0>(floats); + out2 = std::get<1>(floats); +} + +template +Vectorized binary_operator_via_float( + Op op, + const Vectorized& a, + const Vectorized& b) { + const auto [a_float_low, a_float_high] = convert_bfloat16_float(a); + const auto [b_float_low, b_float_high] = convert_bfloat16_float(b); + return convert_float_bfloat16( + op(a_float_low, b_float_low), op(a_float_high, b_float_high)); +} + +template <> +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return binary_operator_via_float(std::plus>(), a, b); +} + +template <> +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return binary_operator_via_float(std::minus>(), a, b); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) 
{ + return binary_operator_via_float(std::multiplies>(), a, b); +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return binary_operator_via_float(std::divides>(), a, b); +} + +inline Vectorized::Vectorized(int val) { + auto vals_f = svdup_n_f32(val); + values = convert_float_bfloat16(vals_f, vals_f); +} + +inline Vectorized::Vectorized(BFloat16 val) { + auto vals_f = svdup_n_f32((float)val); + values = convert_float_bfloat16(vals_f, vals_f); +} + +bool inline Vectorized::has_inf_nan() const { + auto [v1, v2] = convert_bfloat16_float(values); + return v1.has_inf_nan() || v2.has_inf_nan(); +} +// frac. Implement this here so we can use subtraction +Vectorized inline Vectorized::frac() const { + return *this - this->trunc(); +} + +#define DEFINE_BF16_FUNC_VIA_FLOAT(func_name) \ + Vectorized inline Vectorized::func_name() const { \ + auto [v1, v2] = convert_bfloat16_float(*this); \ + v1 = v1.func_name(); \ + v2 = v2.func_name(); \ + return convert_float_bfloat16(v1, v2); \ + } + +#define DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(func_name) \ + Vectorized inline Vectorized::func_name( \ + const Vectorized& a) const { \ + auto [v1, v2] = convert_bfloat16_float(*this); \ + auto [v3, v4] = convert_bfloat16_float(a); \ + v1 = v1.func_name(v3); \ + v2 = v2.func_name(v4); \ + return convert_float_bfloat16(v1, v2); \ + } + +DEFINE_BF16_FUNC_VIA_FLOAT(isnan); +DEFINE_BF16_FUNC_VIA_FLOAT(angle); +DEFINE_BF16_FUNC_VIA_FLOAT(acos); +DEFINE_BF16_FUNC_VIA_FLOAT(acosh); +DEFINE_BF16_FUNC_VIA_FLOAT(asin); +DEFINE_BF16_FUNC_VIA_FLOAT(atan); +DEFINE_BF16_FUNC_VIA_FLOAT(atanh); +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(atan2); +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(copysign); +DEFINE_BF16_FUNC_VIA_FLOAT(erf); +DEFINE_BF16_FUNC_VIA_FLOAT(erfc); +DEFINE_BF16_FUNC_VIA_FLOAT(exp); +DEFINE_BF16_FUNC_VIA_FLOAT(exp2); +DEFINE_BF16_FUNC_VIA_FLOAT(expm1); +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(fmod); +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(hypot); +DEFINE_BF16_FUNC_VIA_FLOAT(i0); +DEFINE_BF16_FUNC_VIA_FLOAT(i0e); +DEFINE_BF16_FUNC_VIA_FLOAT(digamma); +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(igamma); +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(igammac); +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(nextafter); +DEFINE_BF16_FUNC_VIA_FLOAT(log); +DEFINE_BF16_FUNC_VIA_FLOAT(log2); +DEFINE_BF16_FUNC_VIA_FLOAT(log10); +DEFINE_BF16_FUNC_VIA_FLOAT(log1p); +DEFINE_BF16_FUNC_VIA_FLOAT(sin); +DEFINE_BF16_FUNC_VIA_FLOAT(sinh); +DEFINE_BF16_FUNC_VIA_FLOAT(cos); +DEFINE_BF16_FUNC_VIA_FLOAT(cosh); +DEFINE_BF16_FUNC_VIA_FLOAT(ceil); +DEFINE_BF16_FUNC_VIA_FLOAT(floor); +DEFINE_BF16_FUNC_VIA_FLOAT(round); +DEFINE_BF16_FUNC_VIA_FLOAT(tan); +DEFINE_BF16_FUNC_VIA_FLOAT(tanh); +DEFINE_BF16_FUNC_VIA_FLOAT(trunc); +DEFINE_BF16_FUNC_VIA_FLOAT(lgamma); +DEFINE_BF16_FUNC_VIA_FLOAT(sqrt); +DEFINE_BF16_FUNC_VIA_FLOAT(reciprocal); +DEFINE_BF16_FUNC_VIA_FLOAT(rsqrt); +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(pow); + +Vectorized inline Vectorized::operator==( + const Vectorized& other) const { + auto [f1, f2] = convert_bfloat16_float(values); + auto [f3, f4] = convert_bfloat16_float(other); + svbool_t mask1 = svcmpeq_f32(ptrue, f1, f3); + svbool_t mask2 = svcmpeq_f32(ptrue, f2, f4); + auto res1 = svsel_f32(mask1, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK); + auto res2 = svsel_f32(mask2, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK); + + auto bf16_1 = svreinterpret_bf16_f32(res1); + auto bf16_2 = svreinterpret_bf16_f32(res2); + return svuzp1_bf16(bf16_1, bf16_2); +} +Vectorized inline Vectorized::operator!=( + const Vectorized& other) const { + auto [f1, f2] = 
convert_bfloat16_float(values); + auto [f3, f4] = convert_bfloat16_float(other); + svbool_t mask1 = svcmpne_f32(ptrue, f1, f3); + svbool_t mask2 = svcmpne_f32(ptrue, f2, f4); + auto res1 = svsel_f32(mask1, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK); + auto res2 = svsel_f32(mask2, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK); + + auto bf16_1 = svreinterpret_bf16_f32(res1); + auto bf16_2 = svreinterpret_bf16_f32(res2); + return svuzp1_bf16(bf16_1, bf16_2); +} +Vectorized inline Vectorized::operator>( + const Vectorized& other) const { + auto [v1, v2] = convert_bfloat16_float(*this); + auto [v3, v4] = convert_bfloat16_float(other); + return convert_float_bfloat16(v1 > v3, v2 > v4); +} +Vectorized inline Vectorized::operator>=( + const Vectorized& other) const { + auto [v1, v2] = convert_bfloat16_float(*this); + auto [v3, v4] = convert_bfloat16_float(other); + return convert_float_bfloat16(v1 >= v3, v2 >= v4); +} +Vectorized inline Vectorized::operator<( + const Vectorized& other) const { + auto [v1, v2] = convert_bfloat16_float(*this); + auto [v3, v4] = convert_bfloat16_float(other); + return convert_float_bfloat16(v1 < v3, v2 < v4); +} +Vectorized inline Vectorized::operator<=( + const Vectorized& other) const { + auto [v1, v2] = convert_bfloat16_float(*this); + auto [v3, v4] = convert_bfloat16_float(other); + return convert_float_bfloat16(v1 <= v3, v2 <= v4); +} + +// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if +// either input is a NaN. +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return binary_operator_via_float( + static_cast (*)( + const Vectorized&, const Vectorized&)>(&maximum), + a, + b); +} + +// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if +// either input is a NaN. 
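// [Editorial aside, not part of the patch] The comparison operators and the
// maximum/minimum overloads in this new SVE BFloat16 file all route through
// convert_bfloat16_float / binary_operator_via_float, so lane-wise results
// (including NaN propagation for maximum/minimum) are inherited from the
// fp32 implementation. A hypothetical call site, assuming an SVE256 build
// with __ARM_FEATURE_BF16:
//
//   using Vec = at::vec::Vectorized<c10::BFloat16>;
//   Vec a(c10::BFloat16(1.0f));
//   Vec b(c10::BFloat16(2.0f));
//   Vec hi = at::vec::maximum(a, b);  // computed lane-wise in fp32
//   Vec eq = a.eq(b);                 // 1.0 where equal, 0.0 elsewhere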
+template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return binary_operator_via_float( + static_cast (*)( + const Vectorized&, const Vectorized&)>(&minimum), + a, + b); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + return binary_operator_via_float( + static_cast (*)( + const Vectorized&, const Vectorized&)>(&clamp_max), + a, + max); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + return binary_operator_via_float( + static_cast (*)( + const Vectorized&, const Vectorized&)>(&clamp_min), + a, + min); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + return clamp_min(clamp_max(a, max), min); +} + +template <> +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return svreinterpret_bf16_u16( + svand_u16_x(ptrue, svreinterpret_u16_bf16(a), svreinterpret_u16_bf16(b))); +} + +template <> +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return svreinterpret_bf16_u16( + svorr_u16_x(ptrue, svreinterpret_u16_bf16(a), svreinterpret_u16_bf16(b))); +} + +template <> +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return svreinterpret_bf16_u16( + sveor_u16_x(ptrue, svreinterpret_u16_bf16(a), svreinterpret_u16_bf16(b))); +} + +Vectorized inline Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1.0f); +} + +template <> +inline void convert(const BFloat16* src, BFloat16* dst, int64_t n) { + const int64_t fraction = n % Vectorized::size(); +#pragma unroll + for (int64_t i = 0; i < n - fraction; i += Vectorized::size()) { + svst1_bf16( + ptrue, + const_cast(reinterpret_cast(dst)) + i, + svldnt1_bf16( + ptrue, + const_cast(reinterpret_cast(src)) + + i)); + } +#pragma unroll + for (int64_t i = n - fraction; i < n; i += Vectorized::size()) { + svbool_t pg = svwhilelt_b16(i, n); + svst1_bf16( + pg, + const_cast(reinterpret_cast(dst)) + i, + svldnt1_bf16( + pg, + const_cast(reinterpret_cast(src)) + + i)); + } +} + +template <> +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return a * b + c; +} + +#endif // defined(CPU_CAPABILITY_SVE) && defined(__ARM_FEATURE_BF16) + +} // namespace CPU_CAPABILITY +} // namespace vec +} // namespace at diff --git a/aten/src/ATen/cpu/vec/sve/vec_common_sve.h b/aten/src/ATen/cpu/vec/sve/vec_common_sve.h index c7968e271f91..d20556c504f9 100644 --- a/aten/src/ATen/cpu/vec/sve/vec_common_sve.h +++ b/aten/src/ATen/cpu/vec/sve/vec_common_sve.h @@ -5,17 +5,30 @@ #include +<<<<<<< HEAD #include #include #if defined(CPU_CAPABILITY_SVE) #include #include +======= +#include +#include + +#if defined(CPU_CAPABILITY_SVE) +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise 
broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #endif +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::vec { // Note [CPU_CAPABILITY namespace] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -30,6 +43,7 @@ inline namespace CPU_CAPABILITY { #if defined(CPU_CAPABILITY_SVE) // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +<<<<<<< HEAD template<> inline Vectorized cast(const Vectorized& src) { @@ -71,11 +85,57 @@ template std::enable_if_t> inline gather(const float* base_addr, const Vectorized& vindex_) { svint32_t vindex = svasrd_n_s32_x(ptrue, svmul_s32_x(ptrue, vindex_, svdup_n_s32(scale)), 2); +======= +#define DEFINE_SVE_CAST(t1_t, t1_prefix, t2_t, t2_prefix) \ + template <> \ + inline Vectorized cast(const Vectorized& src) { \ + return svreinterpret_##t1_prefix##_##t2_prefix(src); \ + } \ + template <> \ + inline Vectorized cast(const Vectorized& src) { \ + return svreinterpret_##t2_prefix##_##t1_prefix(src); \ + } + +DEFINE_SVE_CAST(int64_t, s64, double, f64) +DEFINE_SVE_CAST(int32_t, s32, double, f64) +DEFINE_SVE_CAST(int16_t, s16, double, f64) +DEFINE_SVE_CAST(int64_t, s64, float, f32) +DEFINE_SVE_CAST(int32_t, s32, float, f32) +DEFINE_SVE_CAST(int16_t, s16, float, f32) +DEFINE_SVE_CAST(float, f32, double, f64) + +#ifdef __ARM_FEATURE_BF16 +DEFINE_SVE_CAST(int64_t, s64, c10::BFloat16, bf16) +DEFINE_SVE_CAST(int32_t, s32, c10::BFloat16, bf16) +DEFINE_SVE_CAST(int16_t, s16, c10::BFloat16, bf16) +#endif // __ARM_FEATURE_BF16 + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template +std::enable_if_t< + scale == 1 || scale == 2 || scale == 4 || scale == 8, + Vectorized< + double>> inline gather(const double* base_addr, const Vectorized& vindex_) { + svint64_t vindex = + svasrd_n_s64_x(ptrue, svmul_s64_x(ptrue, vindex_, svdup_n_s64(scale)), 3); + return svld1_gather_s64index_f64(ptrue, base_addr, vindex); +} + +template +std::enable_if_t< + scale == 1 || scale == 2 || scale == 4 || scale == 8, + Vectorized< + float>> inline gather(const float* base_addr, const Vectorized& vindex_) { + svint32_t vindex = + svasrd_n_s32_x(ptrue, svmul_s32_x(ptrue, vindex_, svdup_n_s32(scale)), 2); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svld1_gather_s32index_f32(ptrue, base_addr, vindex); } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +<<<<<<< HEAD template std::enable_if_t> inline mask_gather(const Vectorized& src, const double* base_addr, @@ -94,12 +154,43 @@ inline mask_gather(const Vectorized& src, const float* base_addr, ALL_S32_TRUE_MASK); svint32_t vindex = svasrd_n_s32_x(ptrue, svmul_s32_x(ptrue, vindex_, svdup_n_s32(scale)), 2); return svsel_f32(mask, svld1_gather_s32index_f32(mask, base_addr, vindex), src); +======= +template +std:: + enable_if_t> inline mask_gather( + const Vectorized& src, + const double* base_addr, + const Vectorized& vindex_, + const Vectorized& mask_) { + svbool_t mask = + svcmpeq_s64(ptrue, svreinterpret_s64_f64(mask_), ALL_S64_TRUE_MASK); + svint64_t vindex = + svasrd_n_s64_x(ptrue, svmul_s64_x(ptrue, vindex_, svdup_n_s64(scale)), 3); + return svsel_f64( + mask, svld1_gather_s64index_f64(mask, base_addr, vindex), src); +} + +template +std:: + enable_if_t> inline mask_gather( + const Vectorized& src, + const float* 
base_addr, + const Vectorized& vindex_, + const Vectorized& mask_) { + svbool_t mask = + svcmpeq_s32(ptrue, svreinterpret_s32_f32(mask_), ALL_S32_TRUE_MASK); + svint32_t vindex = + svasrd_n_s32_x(ptrue, svmul_s32_x(ptrue, vindex_, svdup_n_s32(scale)), 2); + return svsel_f32( + mask, svld1_gather_s32index_f32(mask, base_addr, vindex), src); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Only works for inputs in the range: [-2^51, 2^51] // From: https://stackoverflow.com/a/41148578 +<<<<<<< HEAD template<> Vectorized inline convert_to_int_of_same_size(const Vectorized &src) { @@ -112,20 +203,42 @@ inline convert_to_int_of_same_size(const Vectorized &src) { template<> Vectorized inline convert_to_int_of_same_size(const Vectorized &src) { +======= +template <> +Vectorized inline convert_to_int_of_same_size( + const Vectorized& src) { + svfloat64_t x = svadd_f64_x(ptrue, src, svdup_n_f64(0x0018000000000000)); + return svsub_s64_x( + ptrue, + svreinterpret_s64_f64(x), + svreinterpret_s64_f64(svdup_n_f64(0x0018000000000000))); +} + +template <> +Vectorized inline convert_to_int_of_same_size( + const Vectorized& src) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svcvt_s32_f32_x(ptrue, src); } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template <> +<<<<<<< HEAD std::pair, Vectorized> inline interleave2(const Vectorized& a, const Vectorized& b) { +======= +std::pair, Vectorized> inline interleave2( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // inputs: // a = {a0, a1, a3, a3} // b = {b0, b1, b2, b3} // group cols crossing lanes: // return {a0, b0, a1, b1} // {a2, b2, a3, b3} +<<<<<<< HEAD return std::make_pair(Vectorized(svzip1_f64(a, b)), Vectorized(svzip2_f64(a, b))); } @@ -133,12 +246,24 @@ inline interleave2(const Vectorized& a, const Vectorized template <> std::pair, Vectorized> inline interleave2(const Vectorized& a, const Vectorized& b) { +======= + return std::make_pair( + Vectorized(svzip1_f64(a, b)), + Vectorized(svzip2_f64(a, b))); +} + +template <> +std::pair, Vectorized> inline interleave2( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // inputs: // a = {a0, a1, a2, a3, a4, a5, a6, a7} // b = {b0, b1, b2, b3, b4, b5, b6, b7} // group cols crossing lanes: // return {a0, b0, a1, b1, a2, b2, a3, b3} // {a4, b4, a5, b5, a6, b6, a7, b7} +<<<<<<< HEAD return std::make_pair(Vectorized(svzip1_f32(a, b)), Vectorized(svzip2_f32(a, b))); } @@ -148,12 +273,45 @@ inline interleave2(const Vectorized& a, const Vectorized& b template <> std::pair, Vectorized> inline deinterleave2(const Vectorized& a, const Vectorized& b) { +======= + return std::make_pair( + Vectorized(svzip1_f32(a, b)), Vectorized(svzip2_f32(a, b))); +} + +#ifdef __ARM_FEATURE_BF16 +template <> +std::pair< + Vectorized, + Vectorized> inline interleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, a1, a2, a3, a4, a5, a6, a7} + // b = {b0, b1, b2, b3, b4, b5, b6, b7} + // group cols crossing lanes: + // return {a0, b0, a1, b1, a2, b2, a3, b3} + // {a4, b4, a5, b5, a6, 
b6, a7, b7} + return std::make_pair( + Vectorized(svzip1_bf16(a, b)), + Vectorized(svzip2_bf16(a, b))); +} +#endif // __ARM_FEATURE_BF16 + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEINTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template <> +std::pair, Vectorized> inline deinterleave2( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // inputs: // a = {a0, b0, a1, b1} // b = {a2, b2, a3, b3} // swap lanes: // return {a0, a1, a2, a3} // {b0, b1, b2, b3} +<<<<<<< HEAD return std::make_pair(Vectorized(svuzp1_f64(a, b)), Vectorized(svuzp2_f64(a, b))); } @@ -161,12 +319,24 @@ inline deinterleave2(const Vectorized& a, const Vectorized std::pair, Vectorized> inline deinterleave2(const Vectorized& a, const Vectorized& b) { +======= + return std::make_pair( + Vectorized(svuzp1_f64(a, b)), + Vectorized(svuzp2_f64(a, b))); +} + +template <> +std::pair, Vectorized> inline deinterleave2( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // inputs: // a = {a0, b0, a1, b1, a2, b2, a3, b3} // b = {a4, b4, a5, b5, a6, b6, a7, b7} // swap lanes: // return {a0, a1, a2, a3, a4, a5, a6, a7} // {b0, b1, b2, b3, b4, b5, b6, b7} +<<<<<<< HEAD return std::make_pair(Vectorized(svuzp1_f32(a, b)), Vectorized(svuzp2_f32(a, b))); } @@ -174,3 +344,33 @@ inline deinterleave2(const Vectorized& a, const Vectorized& #endif // defined(CPU_CAPABILITY_SVE) }} +======= + return std::make_pair( + Vectorized(svuzp1_f32(a, b)), Vectorized(svuzp2_f32(a, b))); +} + +#ifdef __ARM_FEATURE_BF16 +template <> +std::pair< + Vectorized, + Vectorized> inline deinterleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, b0, a1, b1, a2, b2, a3, b3} + // b = {a4, b4, a5, b5, a6, b6, a7, b7} + // swap lanes: + // return {a0, a1, a2, a3, a4, a5, a6, a7} + // {b0, b1, b2, b3, b4, b5, b6, b7} + return std::make_pair( + Vectorized(svuzp1_bf16((svbfloat16_t)a, (svbfloat16_t)b)), + Vectorized(svuzp2_bf16((svbfloat16_t)a, (svbfloat16_t)b))); +} +#endif // __ARM_FEATURE_BF16 + +#endif // defined(CPU_CAPABILITY_SVE) + +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/sve/vec_double.h b/aten/src/ATen/cpu/vec/sve/vec_double.h index 23626e29ce1c..b144ccafc2a4 100644 --- a/aten/src/ATen/cpu/vec/sve/vec_double.h +++ b/aten/src/ATen/cpu/vec/sve/vec_double.h @@ -1,8 +1,13 @@ #pragma once #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF) #include @@ -24,10 +29,22 @@ inline namespace CPU_CAPABILITY { #if defined(CPU_CAPABILITY_SVE) +<<<<<<< HEAD template <> class Vectorized { private: vls_float64_t values; public: +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + vls_float64_t values; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = double; using size_type = int; static constexpr size_type size() { @@ -38,24 +55,41 @@ template <> 
class Vectorized { Vectorized(double val) { values = svdup_n_f64(val); } +<<<<<<< HEAD template> Vectorized(Args... vals) { __at_align__ double buffer[size()] = { vals... }; +======= + template < + typename... Args, + typename = std::enable_if_t<(sizeof...(Args) == size())>> + Vectorized(Args... vals) { + __at_align__ double buffer[size()] = {vals...}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) values = svld1_f64(ptrue, buffer); } operator svfloat64_t() const { return values; } template +<<<<<<< HEAD static Vectorized blend(const Vectorized& a, const Vectorized& b) { // Build an array of flags: each element is 1 if the corresponding bit in 'mask' is set, 0 otherwise. +======= + static Vectorized blend( + const Vectorized& a, + const Vectorized& b) { + // Build an array of flags: each element is 1 if the corresponding bit in + // 'mask' is set, 0 otherwise. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ int64_t flag_arr[size()]; for (int i = 0; i < size(); i++) { flag_arr[i] = (mask & (1ULL << i)) ? 1 : 0; } // Load the flag array into an SVE int64 vector. svint64_t int_mask = svld1_s64(svptrue_b64(), flag_arr); +<<<<<<< HEAD // Compare each lane of int_mask to 0; returns an svbool_t predicate where true indicates a nonzero flag. svbool_t blend_mask = svcmpne_n_s64(svptrue_b64(), int_mask, 0); @@ -71,14 +105,44 @@ template <> class Vectorized { } template static Vectorized arange(double base = 0., step_t step = static_cast(1)) { +======= + // Compare each lane of int_mask to 0; returns an svbool_t predicate where + // true indicates a nonzero flag. + svbool_t blend_mask = svcmpne_n_s64(svptrue_b64(), int_mask, 0); + + // Use svsel to select elements from b where the predicate is true, else + // from a. 
+ svfloat64_t result = svsel(blend_mask, b.values, a.values); + return Vectorized(result); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask_) { + svbool_t mask = + svcmpeq_s64(ptrue, svreinterpret_s64_f64(mask_), ALL_S64_TRUE_MASK); + return svsel_f64(mask, b, a); + } + template + static Vectorized arange( + double base = 0., + step_t step = static_cast(1)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ double buffer[size()]; for (int64_t i = 0; i < size(); i++) { buffer[i] = base + i * step; } return svld1_f64(ptrue, buffer); } +<<<<<<< HEAD static Vectorized set(const Vectorized& a, const Vectorized& b, int64_t count = size()) { +======= + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (count == 0) { return a; } else if (count < size()) { @@ -100,19 +164,37 @@ template <> class Vectorized { svst1_f64(pg, reinterpret_cast(ptr), values); } } +<<<<<<< HEAD const double& operator[](int idx) const = delete; double& operator[](int idx) = delete; int64_t zero_mask() const { // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit +======= + const double& operator[](int idx) const = delete; + double& operator[](int idx) = delete; + int64_t zero_mask() const { + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated to 0-bit +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t mask = 0; __at_align__ int64_t mask_array[size()]; svbool_t svbool_mask = svcmpeq_f64(ptrue, values, ZERO_F64); +<<<<<<< HEAD svst1_s64(ptrue, mask_array, svsel_s64(svbool_mask, ALL_S64_TRUE_MASK, ALL_S64_FALSE_MASK)); for (int64_t i = 0; i < size(); ++i) { if (mask_array[i]) mask |= (1ull << i); +======= + svst1_s64( + ptrue, + mask_array, + svsel_s64(svbool_mask, ALL_S64_TRUE_MASK, ALL_S64_FALSE_MASK)); + for (int64_t i = 0; i < size(); ++i) { + if (mask_array[i]) + mask |= (1ull << i); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } return mask; } @@ -122,7 +204,13 @@ template <> class Vectorized { return svsel_f64(mask, ALL_F64_TRUE_MASK, ALL_F64_FALSE_MASK); } bool has_inf_nan() const { +<<<<<<< HEAD return svptest_any(ptrue, svcmpuo_f64(ptrue, svsub_f64_x(ptrue, values, values), ZERO_F64)); +======= + return svptest_any( + ptrue, + svcmpuo_f64(ptrue, svsub_f64_x(ptrue, values, values), ZERO_F64)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized map(double (*f)(double)) const { __at_align__ double tmp[size()]; @@ -155,6 +243,7 @@ template <> class Vectorized { return *this; } Vectorized acos() const { +<<<<<<< HEAD return USE_SLEEF(Vectorized(Sleef_acosdx_u10sve(values)),map(std::acos)); } Vectorized acosh() const { @@ -174,6 +263,33 @@ template <> class Vectorized { } Vectorized atan2(const Vectorized &b) const { USE_SLEEF({return Vectorized(Sleef_atan2dx_u10sve(values, b));}, +======= + return USE_SLEEF( + Vectorized(Sleef_acosdx_u10sve(values)), map(std::acos)); + } + Vectorized acosh() const { + return USE_SLEEF( + 
Vectorized(Sleef_acoshdx_u10sve(values)), map(std::acosh)); + } + Vectorized asin() const { + return USE_SLEEF( + Vectorized(Sleef_asindx_u10sve(values)), map(std::asin)); + } + Vectorized asinh() const { + return USE_SLEEF( + Vectorized(Sleef_asinhdx_u10sve(values)), map(std::asinh)); + } + Vectorized atan() const { + return USE_SLEEF( + Vectorized(Sleef_atandx_u10sve(values)), map(std::atan)); + } + Vectorized atanh() const { + return USE_SLEEF( + Vectorized(Sleef_atanhdx_u10sve(values)), map(std::atanh)); + } + Vectorized atan2(const Vectorized& b) const {USE_SLEEF( + { return Vectorized(Sleef_atan2dx_u10sve(values, b)); }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) { __at_align__ double tmp[size()]; __at_align__ double tmp_b[size()]; @@ -183,6 +299,7 @@ template <> class Vectorized { tmp[i] = std::atan2(tmp[i], tmp_b[i]); } return loadu(tmp); +<<<<<<< HEAD } ) } @@ -205,11 +322,33 @@ template <> class Vectorized { } Vectorized erfc() const { return USE_SLEEF(Vectorized(Sleef_erfcdx_u15sve(values)),map(std::erfc)); +======= + })} Vectorized copysign(const Vectorized& sign) const { + USE_SLEEF( + { return Vectorized(Sleef_copysigndx_sve(values, sign)); }, + { + __at_align__ double tmp[size()]; + __at_align__ double tmp_sign[size()]; + store(tmp); + sign.store(tmp_sign); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = std::copysign(tmp[i], tmp_sign[i]); + } + return loadu(tmp); + })} Vectorized erf() const { + return USE_SLEEF( + Vectorized(Sleef_erfdx_u10sve(values)), map(std::erf)); + } + Vectorized erfc() const { + return USE_SLEEF( + Vectorized(Sleef_erfcdx_u15sve(values)), map(std::erfc)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized erfinv() const { return map(calc_erfinv); } Vectorized exp() const { +<<<<<<< HEAD return USE_SLEEF(Vectorized(Sleef_expdx_u10sve(values)),map(std::exp)); } Vectorized exp2() const { @@ -217,13 +356,31 @@ template <> class Vectorized { } Vectorized expm1() const { return USE_SLEEF(Vectorized(Sleef_expm1dx_u10sve(values)),map(std::expm1)); +======= + return USE_SLEEF( + Vectorized(Sleef_expdx_u10sve(values)), map(std::exp)); + } + Vectorized exp2() const { + return USE_SLEEF( + Vectorized(Sleef_exp2dx_u10sve(values)), map(std::exp2)); + } + Vectorized expm1() const { + return USE_SLEEF( + Vectorized(Sleef_expm1dx_u10sve(values)), map(std::expm1)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized exp_u20() const { return exp(); } +<<<<<<< HEAD Vectorized fmod(const Vectorized& q) const { USE_SLEEF({return Vectorized(Sleef_fmoddx_sve(values, q));}, { +======= + Vectorized fmod(const Vectorized& q) const {USE_SLEEF( + { return Vectorized(Sleef_fmoddx_sve(values, q)); }, + { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ double tmp[size()]; __at_align__ double tmp_q[size()]; store(tmp); @@ -232,6 +389,7 @@ template <> class Vectorized { tmp[i] = std::fmod(tmp[i], tmp_q[i]); } return loadu(tmp); +<<<<<<< HEAD } ) } @@ -249,6 +407,21 @@ template <> class Vectorized { }) } Vectorized i0() const { +======= + })} Vectorized hypot(const Vectorized& b) const { + USE_SLEEF( + { return Vectorized(Sleef_hypotdx_u05sve(values, b)); }, + { + __at_align__ double tmp[size()]; + __at_align__ double tmp_b[size()]; 
+ store(tmp); + b.store(tmp_b); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = std::hypot(tmp[i], tmp_b[i]); + } + return loadu(tmp); + })} Vectorized i0() const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(calc_i0); } Vectorized i0e() const { @@ -257,7 +430,11 @@ template <> class Vectorized { Vectorized digamma() const { return map(calc_digamma); } +<<<<<<< HEAD Vectorized igamma(const Vectorized &x) const { +======= + Vectorized igamma(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ double tmp[size()]; __at_align__ double tmp_x[size()]; store(tmp); @@ -267,7 +444,11 @@ template <> class Vectorized { } return loadu(tmp); } +<<<<<<< HEAD Vectorized igammac(const Vectorized &x) const { +======= + Vectorized igammac(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ double tmp[size()]; __at_align__ double tmp_x[size()]; store(tmp); @@ -277,11 +458,16 @@ template <> class Vectorized { } return loadu(tmp); } +<<<<<<< HEAD Vectorized nextafter(const Vectorized &b) const { USE_SLEEF( { return Vectorized(Sleef_nextafterdx_sve(values, b)); }, +======= + Vectorized nextafter(const Vectorized& b) const {USE_SLEEF( + { return Vectorized(Sleef_nextafterdx_sve(values, b)); }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) { __at_align__ double tmp[size()]; __at_align__ double tmp_b[size()]; @@ -291,6 +477,7 @@ template <> class Vectorized { tmp[i] = std::nextafter(tmp[i], tmp_b[i]); } return loadu(tmp); +<<<<<<< HEAD } ) } @@ -318,6 +505,40 @@ template <> class Vectorized { } Vectorized cosh() const { return USE_SLEEF( Vectorized(Sleef_coshdx_u10sve(values)),map(std::cosh)); +======= + })} Vectorized log() const { + return USE_SLEEF( + Vectorized(Sleef_logdx_u10sve(values)), map(std::log)); + } + Vectorized log2() const { + return USE_SLEEF( + Vectorized(Sleef_log2dx_u10sve(values)), map(std::log2)); + } + Vectorized log10() const { + return USE_SLEEF( + Vectorized(Sleef_log10dx_u10sve(values)), map(std::log10)); + } + Vectorized log1p() const { + return USE_SLEEF( + Vectorized(Sleef_log1pdx_u10sve(values)), map(std::log1p)); + } + Vectorized frac() const; + Vectorized sin() const { + return USE_SLEEF( + Vectorized(Sleef_sindx_u10sve(values)), map(std::sin)); + } + Vectorized sinh() const { + return USE_SLEEF( + Vectorized(Sleef_sinhdx_u10sve(values)), map(std::sinh)); + } + Vectorized cos() const { + return USE_SLEEF( + Vectorized(Sleef_cosdx_u10sve(values)), map(std::cos)); + } + Vectorized cosh() const { + return USE_SLEEF( + Vectorized(Sleef_coshdx_u10sve(values)), map(std::cosh)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized ceil() const { return svrintp_f64_x(ptrue, values); @@ -332,16 +553,30 @@ template <> class Vectorized { return svrinti_f64_x(ptrue, values); } Vectorized tan() const { +<<<<<<< HEAD return USE_SLEEF( Vectorized(Sleef_tandx_u10sve(values)),map(std::tan)); } Vectorized tanh() const { return USE_SLEEF( Vectorized(Sleef_tanhdx_u10sve(values)),map(std::tanh)); +======= + return USE_SLEEF( + Vectorized(Sleef_tandx_u10sve(values)), map(std::tan)); + } + Vectorized tanh() const 
{ + return USE_SLEEF( + Vectorized(Sleef_tanhdx_u10sve(values)), map(std::tanh)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized trunc() const { return svrintz_f64_x(ptrue, values); } Vectorized lgamma() const { +<<<<<<< HEAD return USE_SLEEF( Vectorized(Sleef_lgammadx_u10sve(values)),map(std::lgamma)); +======= + return USE_SLEEF( + Vectorized(Sleef_lgammadx_u10sve(values)), map(std::lgamma)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized sqrt() const { return svsqrt_f64_x(ptrue, values); @@ -352,6 +587,7 @@ template <> class Vectorized { Vectorized rsqrt() const { return svdivr_f64_x(ptrue, svsqrt_f64_x(ptrue, values), ONE_F64); } +<<<<<<< HEAD Vectorized pow(const Vectorized &b) const { USE_SLEEF( {return Vectorized(Sleef_powdx_u10sve(values, b));}, { @@ -369,6 +605,22 @@ template <> class Vectorized { // Comparison using the _CMP_**_OQ predicate. // `O`: get false if an operand is NaN // `Q`: do not raise if an operand is NaN +======= + Vectorized pow(const Vectorized& b) const {USE_SLEEF( + { return Vectorized(Sleef_powdx_u10sve(values, b)); }, + { + __at_align__ double tmp[size()]; + __at_align__ double tmp_b[size()]; + store(tmp); + b.store(tmp_b); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = std::pow(tmp[i], tmp_b[i]); + } + return loadu(tmp); + })} // Comparison using the _CMP_**_OQ predicate. + // `O`: get false if an operand is NaN + // `Q`: do not raise if an operand is NaN +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized operator==(const Vectorized& other) const { svbool_t mask = svcmpeq_f64(ptrue, values, other); return svsel_f64(mask, ALL_F64_TRUE_MASK, ALL_F64_FALSE_MASK); @@ -408,22 +660,46 @@ template <> class Vectorized { }; template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svadd_f64_x(ptrue, a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svsub_f64_x(ptrue, a, b); } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svmul_f64_x(ptrue, a, b); } template <> +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svdiv_f64_x(ptrue, a, b); } @@ -435,33 +711,65 @@ Vectorized inline Vectorized::frac() const { // Implements the IEEE 754 201X `maximum` operation, which propagates NaN if // either input is a NaN. 
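For reference, a minimal scalar sketch of the NaN-propagating `maximum` semantics the comment above refers to (illustrative only, not part of this patch; `ieee_maximum` is a hypothetical name). Note that std::fmax behaves differently and returns the non-NaN operand:

#include <cmath>
#include <limits>

// IEEE 754-201x maximum: if either operand is NaN, the result is NaN.
double ieee_maximum(double a, double b) {
  if (std::isnan(a) || std::isnan(b)) {
    return std::numeric_limits<double>::quiet_NaN();  // propagate NaN
  }
  return a > b ? a : b;
}
// ieee_maximum(1.0, NAN) -> NaN, whereas std::fmax(1.0, NAN) -> 1.0.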
template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svmax_f64_x(ptrue, a, b); } // Implements the IEEE 754 201X `minimum` operation, which propagates NaN if // either input is a NaN. template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svmin_f64_x(ptrue, a, b); } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svmin_f64_x(ptrue, max, svmax_f64_x(ptrue, min, a)); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svmin_f64_x(ptrue, max, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svmax_f64_x(ptrue, min, a); } template <> +<<<<<<< HEAD Vectorized inline operator&(const Vectorized& a, const Vectorized& b) { return svreinterpret_f64_s64(svand_s64_x(ptrue, svreinterpret_s64_f64(a), svreinterpret_s64_f64(b))); } @@ -497,6 +805,58 @@ Vectorized inline Vectorized::lt(const Vectorized& other } Vectorized inline Vectorized::le(const Vectorized& other) const { +======= +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return svreinterpret_f64_s64( + svand_s64_x(ptrue, svreinterpret_s64_f64(a), svreinterpret_s64_f64(b))); +} + +template <> +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return svreinterpret_f64_s64( + svorr_s64_x(ptrue, svreinterpret_s64_f64(a), svreinterpret_s64_f64(b))); +} + +template <> +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return svreinterpret_f64_s64( + sveor_s64_x(ptrue, svreinterpret_s64_f64(a), svreinterpret_s64_f64(b))); +} + +Vectorized inline Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0); +} + +Vectorized inline Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0); +} + +Vectorized inline Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0); +} + +Vectorized inline Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0); +} + +Vectorized inline Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0); +} + +Vectorized inline Vectorized::le( + const Vectorized& other) const { +>>>>>>> 5729657180 
([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*this <= other) & Vectorized(1.0); } @@ -515,10 +875,22 @@ inline void convert(const double* src, double* dst, int64_t n) { } template <> +<<<<<<< HEAD Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svmad_f64_x(ptrue, a, b, c); } #endif // defined(CPU_CAPABILITY_SVE) +<<<<<<< HEAD }} +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/sve/vec_float.h b/aten/src/ATen/cpu/vec/sve/vec_float.h index 6a3dc2bc1c10..bbaa6e686cf7 100644 --- a/aten/src/ATen/cpu/vec/sve/vec_float.h +++ b/aten/src/ATen/cpu/vec/sve/vec_float.h @@ -1,8 +1,13 @@ #pragma once #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF) #include @@ -24,10 +29,22 @@ inline namespace CPU_CAPABILITY { #if defined(CPU_CAPABILITY_SVE) +<<<<<<< HEAD template <> class Vectorized { private: vls_float32_t values; public: +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + vls_float32_t values; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = float; using size_type = int; static constexpr size_type size() { @@ -38,16 +55,25 @@ template <> class Vectorized { Vectorized(float val) { values = svdup_n_f32(val); } +<<<<<<< HEAD template> Vectorized(Args... vals) { __at_align__ float buffer[size()] = { vals... }; +======= + template < + typename... Args, + typename = std::enable_if_t<(sizeof...(Args) == size())>> + Vectorized(Args... vals) { + __at_align__ float buffer[size()] = {vals...}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) values = svld1_f32(ptrue, buffer); } operator svfloat32_t() const { return values; } template +<<<<<<< HEAD static Vectorized blend(const Vectorized& a, const Vectorized& b) { // Build an array of flags: each element is 1 if the corresponding bit in 'mask' is set, 0 otherwise. __at_align__ int32_t flag_arr[size()]; @@ -70,14 +96,54 @@ template <> class Vectorized { } template static Vectorized arange(float base = 0.f, step_t step = static_cast(1)) { +======= + static Vectorized blend( + const Vectorized& a, + const Vectorized& b) { + // Build an array of flags: each element is 1 if the corresponding bit in + // 'mask' is set, 0 otherwise. + __at_align__ int32_t flag_arr[size()]; + for (int i = 0; i < size(); i++) { + flag_arr[i] = (mask & (1ULL << i)) ? 1 : 0; + } + // Load the flag array into an SVE int32 vector. + svint32_t int_mask = svld1_s32(svptrue_b32(), flag_arr); + // Compare each lane of int_mask to 0; returns an svbool_t predicate where + // true indicates a nonzero flag. 
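For intuition, a scalar model of the bit-mask blend being built above (a sketch only, not part of this patch; `blend_ref` and the example mask are hypothetical):

#include <array>
#include <cstddef>
#include <cstdint>

// Lane i takes b[i] when bit i of `mask` is set, otherwise a[i].
// For example, mask = 0b0101 picks b in lanes 0 and 2, and a elsewhere.
template <uint64_t mask, std::size_t N>
std::array<float, N> blend_ref(const std::array<float, N>& a,
                               const std::array<float, N>& b) {
  std::array<float, N> out{};
  for (std::size_t i = 0; i < N; ++i) {
    out[i] = (mask & (1ULL << i)) ? b[i] : a[i];
  }
  return out;
}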
+ svbool_t blend_mask = svcmpne_n_s32(svptrue_b32(), int_mask, 0); + // Use svsel to select elements from b where the predicate is true, else + // from a. + svfloat32_t result = svsel_f32(blend_mask, b.values, a.values); + return Vectorized(result); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask_) { + svbool_t mask = + svcmpeq_s32(ptrue, svreinterpret_s32_f32(mask_), ALL_S32_TRUE_MASK); + return svsel_f32(mask, b, a); + } + template + static Vectorized arange( + float base = 0.f, + step_t step = static_cast(1)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ float buffer[size()]; for (int64_t i = 0; i < size(); i++) { buffer[i] = base + i * step; } return svld1_f32(ptrue, buffer); } +<<<<<<< HEAD static Vectorized set(const Vectorized& a, const Vectorized& b, int64_t count = size()) { +======= + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (count == 0) { return a; } else if (count < size()) { @@ -85,6 +151,74 @@ template <> class Vectorized { } return b; } +<<<<<<< HEAD +======= + // Implementation is picked from + // https://github.com/ARM-software/ComputeLibrary/blob/v25.01/src/core/NEON/SVEMath.inl#L105 + inline svfloat32_t svexp_f32_z(svbool_t pg, svfloat32_t x) const { + const auto c1 = + svreinterpret_f32_u32(svdup_n_u32(0x3f7ffff6)); // x^1: 0x1.ffffecp-1f + const auto c2 = + svreinterpret_f32_u32(svdup_n_u32(0x3efffedb)); // x^2: 0x1.fffdb6p-2f + const auto c3 = + svreinterpret_f32_u32(svdup_n_u32(0x3e2aaf33)); // x^3: 0x1.555e66p-3f + const auto c4 = + svreinterpret_f32_u32(svdup_n_u32(0x3d2b9f17)); // x^4: 0x1.573e2ep-5f + const auto c5 = + svreinterpret_f32_u32(svdup_n_u32(0x3c072010)); // x^5: 0x1.0e4020p-7f + const auto shift = svreinterpret_f32_u32( + svdup_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f + const auto inv_ln2 = svreinterpret_f32_u32( + svdup_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f + const auto neg_ln2_hi = svreinterpret_f32_u32(svdup_n_u32( + 0xbf317200)); // -ln(2) from bits -1 to -19: -0x1.62e400p-1f + const auto neg_ln2_lo = svreinterpret_f32_u32(svdup_n_u32( + 0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f + const auto inf = svdup_n_f32(std::numeric_limits::infinity()); + const auto max_input = svdup_n_f32(88.37f); // Approximately ln(2^127.5) + const auto zero = svdup_n_f32(0.f); + const auto min_input = svdup_n_f32(-86.64f); // Approximately ln(2^-125) + // Range reduction: + // e^x = 2^n * e^r + // where: + // n = floor(x / ln(2)) + // r = x - n * ln(2) + // + // By adding x / ln(2) with 2^23 + 127 (shift): + // * As FP32 fraction part only has 23-bits, the addition of 2^23 + 127 + // forces decimal part + // of x / ln(2) out of the result. The integer part of x / ln(2) (i.e. + // n) + 127 will occupy the whole fraction part of z in FP32 format. + // Subtracting 2^23 + 127 (shift) from z will result in the integer part + // of x / ln(2) (i.e. n) because the decimal part has been pushed out + // and lost. + // * The addition of 127 makes the FP32 fraction part of z ready to be + // used as the exponent + // in FP32 format. Left shifting z by 23 bits will result in 2^n. 
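A plain-C++ sketch of the range reduction described in the comment above (illustrative only, not part of this patch; it uses std::ldexp and a plain Taylor polynomial instead of the bit-level 2^23 + 127 shift and the tuned coefficients c1..c5):

#include <cmath>

// e^x = 2^n * e^r with n ~= round(x / ln 2) and r = x - n * ln 2, so that
// |r| <= ln(2)/2 and a short polynomial approximates e^r accurately.
float exp_reduced_ref(float x) {
  const float inv_ln2 = 1.4426950408889634f;  // 1 / ln(2)
  const float ln2 = 0.6931471805599453f;
  float n = std::nearbyint(x * inv_ln2);      // exponent of the 2^n factor
  float r = x - n * ln2;                      // reduced argument
  // Degree-5 Taylor series of e^r; the kernel uses minimax coefficients.
  float poly =
      1.f + r * (1.f + r * (0.5f + r * (1.f / 6.f + r * (1.f / 24.f + r * (1.f / 120.f)))));
  return std::ldexp(poly, static_cast<int>(n));  // multiply by 2^n
}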
+ const auto z = svmla_f32_z(pg, shift, x, inv_ln2); + const auto n = svsub_f32_z(pg, z, shift); + const auto scale = svreinterpret_f32_u32( + svlsl_n_u32_z(pg, svreinterpret_u32_f32(z), 23)); // 2^n + // The calculation of n * ln(2) is done using 2 steps to achieve accuracy + // beyond FP32. This outperforms longer Taylor series (3-4 tabs) both in + // term of accuracy and performance. + const auto r_hi = svmla_f32_z(pg, x, n, neg_ln2_hi); + const auto r = svmla_f32_z(pg, r_hi, n, neg_ln2_lo); + // Compute the truncated Taylor series of e^r. + // poly = scale * (1 + c1 * r + c2 * r^2 + c3 * r^3 + c4 * r^4 + c5 * r^5) + const auto r2 = svmul_f32_z(pg, r, r); + const auto p1 = svmul_f32_z(pg, c1, r); + const auto p23 = svmla_f32_z(pg, c2, c3, r); + const auto p45 = svmla_f32_z(pg, c4, c5, r); + const auto p2345 = svmla_f32_z(pg, p23, p45, r2); + const auto p12345 = svmla_f32_z(pg, p1, p2345, r2); + auto poly = svmla_f32_z(pg, scale, p12345, scale); + // Handle underflow and overflow. + poly = svsel_f32(svcmplt_f32(pg, x, min_input), zero, poly); + poly = svsel_f32(svcmpgt_f32(pg, x, max_input), inf, poly); + return poly; + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static Vectorized loadu(const void* ptr, int64_t count = size()) { if (count == size()) return svld1_f32(ptrue, reinterpret_cast(ptr)); @@ -99,19 +233,37 @@ template <> class Vectorized { svst1_f32(pg, reinterpret_cast(ptr), values); } } +<<<<<<< HEAD const float& operator[](int idx) const = delete; float& operator[](int idx) = delete; int64_t zero_mask() const { // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit +======= + const float& operator[](int idx) const = delete; + float& operator[](int idx) = delete; + int64_t zero_mask() const { + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated to 0-bit +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t mask = 0; __at_align__ int32_t mask_array[size()]; svbool_t svbool_mask = svcmpeq_f32(ptrue, values, ZERO_F32); +<<<<<<< HEAD svst1_s32(ptrue, mask_array, svsel_s32(svbool_mask, ALL_S32_TRUE_MASK, ALL_S32_FALSE_MASK)); for (int64_t i = 0; i < size(); ++i) { if (mask_array[i]) mask |= (1ull << i); +======= + svst1_s32( + ptrue, + mask_array, + svsel_s32(svbool_mask, ALL_S32_TRUE_MASK, ALL_S32_FALSE_MASK)); + for (int64_t i = 0; i < size(); ++i) { + if (mask_array[i]) + mask |= (1ull << i); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } return mask; } @@ -121,7 +273,13 @@ template <> class Vectorized { return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK); } bool has_inf_nan() const { +<<<<<<< HEAD return svptest_any(ptrue, svcmpuo_f32(ptrue, svsub_f32_x(ptrue, values, values), ZERO_F32)); +======= + return svptest_any( + ptrue, + svcmpuo_f32(ptrue, svsub_f32_x(ptrue, values, values), ZERO_F32)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized map(float (*f)(float)) const { __at_align__ float tmp[size()]; @@ -154,6 +312,7 @@ template <> class Vectorized { return values; } Vectorized acos() const { +<<<<<<< HEAD return USE_SLEEF(Vectorized(Sleef_acosfx_u10sve(values)),map(std::acos)); } Vectorized acosh() const { @@ -173,11 
+332,39 @@ template <> class Vectorized { } Vectorized atan2(const Vectorized &b) const { USE_SLEEF({return Vectorized(Sleef_atan2fx_u10sve(values, b));}, +======= + return USE_SLEEF( + Vectorized(Sleef_acosfx_u10sve(values)), map(std::acos)); + } + Vectorized acosh() const { + return USE_SLEEF( + Vectorized(Sleef_acoshfx_u10sve(values)), map(std::acosh)); + } + Vectorized asin() const { + return USE_SLEEF( + Vectorized(Sleef_asinfx_u10sve(values)), map(std::asin)); + } + Vectorized asinh() const { + return USE_SLEEF( + Vectorized(Sleef_asinhfx_u10sve(values)), map(std::asinh)); + } + Vectorized atan() const { + return USE_SLEEF( + Vectorized(Sleef_atanfx_u10sve(values)), map(std::atan)); + } + Vectorized atanh() const { + return USE_SLEEF( + Vectorized(Sleef_atanhfx_u10sve(values)), map(std::atanh)); + } + Vectorized atan2(const Vectorized& b) const {USE_SLEEF( + { return Vectorized(Sleef_atan2fx_u10sve(values, b)); }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) { __at_align__ float tmp[size()]; __at_align__ float tmp_b[size()]; store(tmp); b.store(tmp_b); +<<<<<<< HEAD for (int64_t i = 0; i < size(); i++){ tmp[i] = std::atan2(tmp[i], tmp_b[i]); } @@ -204,11 +391,38 @@ template <> class Vectorized { } Vectorized erfc() const { return USE_SLEEF(Vectorized(Sleef_erfcfx_u15sve(values)),map(std::erfc)); +======= + for (int64_t i = 0; i < size(); i++) { + tmp[i] = std::atan2(tmp[i], tmp_b[i]); + } + return loadu(tmp); + })} Vectorized copysign(const Vectorized& sign) const { + + USE_SLEEF( + { return Vectorized(Sleef_copysignfx_sve(values, sign)); }, + { + __at_align__ float tmp[size()]; + __at_align__ float tmp_sign[size()]; + store(tmp); + sign.store(tmp_sign); + for (int64_t i = 0; i < size(); ++i) { + tmp[i] = std::copysign(tmp[i], tmp_sign[i]); + } + return loadu(tmp); + })} Vectorized erf() const { + return USE_SLEEF( + Vectorized(Sleef_erffx_u10sve(values)), map(std::erf)); + } + Vectorized erfc() const { + return USE_SLEEF( + Vectorized(Sleef_erfcfx_u15sve(values)), map(std::erfc)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized erfinv() const { return map(calc_erfinv); } Vectorized exp() const { +<<<<<<< HEAD return USE_SLEEF(Vectorized(Sleef_expfx_u10sve(values)),map(std::exp)); } Vectorized exp2() const { @@ -216,13 +430,31 @@ template <> class Vectorized { } Vectorized expm1() const { return USE_SLEEF(Vectorized(Sleef_expm1fx_u10sve(values)),map(std::expm1)); +======= + return USE_SLEEF( + Vectorized(Sleef_expfx_u10sve(values)), map(std::exp)); + } + Vectorized exp2() const { + return USE_SLEEF( + Vectorized(Sleef_exp2fx_u10sve(values)), map(std::exp2)); + } + Vectorized expm1() const { + return USE_SLEEF( + Vectorized(Sleef_expm1fx_u10sve(values)), map(std::expm1)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized exp_u20() const { return exp(); } +<<<<<<< HEAD Vectorized fmod(const Vectorized& q) const { USE_SLEEF({return Vectorized(Sleef_fmodfx_sve(values, q));}, { +======= + Vectorized fmod(const Vectorized& q) const {USE_SLEEF( + { return Vectorized(Sleef_fmodfx_sve(values, q)); }, + { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ float tmp[size()]; __at_align__ float tmp_q[size()]; store(tmp); @@ -231,6 +463,7 @@ template 
<> class Vectorized { tmp[i] = std::fmod(tmp[i], tmp_q[i]); } return loadu(tmp); +<<<<<<< HEAD }) } Vectorized hypot(const Vectorized &b) const { @@ -248,6 +481,21 @@ template <> class Vectorized { ) } Vectorized i0() const { +======= + })} Vectorized hypot(const Vectorized& b) const { + USE_SLEEF( + { return Vectorized(Sleef_hypotfx_u05sve(values, b)); }, + { + __at_align__ float tmp[size()]; + __at_align__ float tmp_b[size()]; + store(tmp); + b.store(tmp_b); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = std::hypot(tmp[i], tmp_b[i]); + } + return loadu(tmp); + })} Vectorized i0() const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(calc_i0); } Vectorized i0e() const { @@ -256,7 +504,11 @@ template <> class Vectorized { Vectorized digamma() const { return map(calc_digamma); } +<<<<<<< HEAD Vectorized igamma(const Vectorized &x) const { +======= + Vectorized igamma(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ float tmp[size()]; __at_align__ float tmp_x[size()]; store(tmp); @@ -266,7 +518,11 @@ template <> class Vectorized { } return loadu(tmp); } +<<<<<<< HEAD Vectorized igammac(const Vectorized &x) const { +======= + Vectorized igammac(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ float tmp[size()]; __at_align__ float tmp_x[size()]; store(tmp); @@ -276,11 +532,16 @@ template <> class Vectorized { } return loadu(tmp); } +<<<<<<< HEAD Vectorized nextafter(const Vectorized &b) const { USE_SLEEF( { return Vectorized(Sleef_nextafterfx_sve(values, b)); }, +======= + Vectorized nextafter(const Vectorized& b) const {USE_SLEEF( + { return Vectorized(Sleef_nextafterfx_sve(values, b)); }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) { __at_align__ float tmp[size()]; __at_align__ float tmp_b[size()]; @@ -290,6 +551,7 @@ template <> class Vectorized { tmp[i] = std::nextafter(tmp[i], tmp_b[i]); } return loadu(tmp); +<<<<<<< HEAD } ) } @@ -317,6 +579,40 @@ template <> class Vectorized { } Vectorized cosh() const { return USE_SLEEF(Vectorized(Sleef_coshfx_u10sve(values)),map(std::cosh)); +======= + })} Vectorized log() const { + return USE_SLEEF( + Vectorized(Sleef_logfx_u10sve(values)), map(std::log)); + } + Vectorized log2() const { + return USE_SLEEF( + Vectorized(Sleef_log2fx_u10sve(values)), map(std::log2)); + } + Vectorized log10() const { + return USE_SLEEF( + Vectorized(Sleef_log10fx_u10sve(values)), map(std::log10)); + } + Vectorized log1p() const { + return USE_SLEEF( + Vectorized(Sleef_log1pfx_u10sve(values)), map(std::log1p)); + } + Vectorized frac() const; + Vectorized sin() const { + return USE_SLEEF( + Vectorized(Sleef_sinfx_u10sve(values)), map(std::sin)); + } + Vectorized sinh() const { + return USE_SLEEF( + Vectorized(Sleef_sinhfx_u10sve(values)), map(std::sinh)); + } + Vectorized cos() const { + return USE_SLEEF( + Vectorized(Sleef_cosfx_u10sve(values)), map(std::cos)); + } + Vectorized cosh() const { + return USE_SLEEF( + Vectorized(Sleef_coshfx_u10sve(values)), map(std::cosh)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized ceil() const { return svrintp_f32_x(ptrue, 
values); @@ -331,16 +627,67 @@ template <> class Vectorized { return svrinti_f32_x(ptrue, values); } Vectorized tan() const { +<<<<<<< HEAD return USE_SLEEF(Vectorized(Sleef_tanfx_u10sve(values)),map(std::tan)); } Vectorized tanh() const { return USE_SLEEF(Vectorized(Sleef_tanhfx_u10sve(values)),map(std::tanh)); +======= + return USE_SLEEF( + Vectorized(Sleef_tanfx_u10sve(values)), map(std::tan)); + } + // Implementation is picked from + // https://github.com/ARM-software/ComputeLibrary/blob/v25.01/src/core/NEON/SVEMath.inl#L179 + Vectorized tanh() const { + // Constants used for the tanh calculation. + const svfloat32_t CONST_1 = + svdup_n_f32(1.f); // Constant 1.0f for the tanh formula. + const svfloat32_t CONST_2 = svdup_n_f32( + 2.f); // Constant 2.0f for the tanh formula (used in exp(2x)). + const svfloat32_t CONST_MIN_TANH = svdup_n_f32( + -10.f); // Minimum threshold for input values to prevent overflow. + const svfloat32_t CONST_MAX_TANH = svdup_n_f32( + 10.f); // Maximum threshold for input values to prevent overflow. + + // Step 1: Clamp the values within the range [-10, 10] to prevent overflow + // during exponentiation. The tanh function approaches ±1 rapidly as the + // input grows large, so we limit the input range to avoid numerical + // instability. svmax_f32_z ensures values are greater than -10, and + // svmin_f32_z ensures they are less than 10. + svfloat32_t x = svmin_f32_z( + ptrue, svmax_f32_z(ptrue, values, CONST_MIN_TANH), CONST_MAX_TANH); + + // Step 2: Calculate exp(2 * x), where x is the clamped value. + // svmul_f32_z computes 2 * x, and svexp_f32_z computes the exponential of + // the result. + svfloat32_t exp2x = svexp_f32_z(ptrue, svmul_f32_z(ptrue, CONST_2, x)); + + // Step 3: Calculate the numerator of the tanh function, which is exp(2x) + // - 1. + svfloat32_t num = svsub_f32_z(ptrue, exp2x, CONST_1); + + // Step 4: Calculate the denominator of the tanh function, which is exp(2x) + // + 1. + svfloat32_t den = svadd_f32_z(ptrue, exp2x, CONST_1); + + // Step 5: Calculate the tanh function as the ratio of the numerator and + // denominator: num / den. + svfloat32_t tanh = svdiv_f32_z(ptrue, num, den); + + // Return the calculated tanh values. + return tanh; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized trunc() const { return svrintz_f32_x(ptrue, values); } Vectorized lgamma() const { +<<<<<<< HEAD return USE_SLEEF(Vectorized(Sleef_lgammafx_u10sve(values)),map(std::lgamma)); +======= + return USE_SLEEF( + Vectorized(Sleef_lgammafx_u10sve(values)), map(std::lgamma)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized sqrt() const { return svsqrt_f32_x(ptrue, values); @@ -351,6 +698,7 @@ template <> class Vectorized { Vectorized rsqrt() const { return svdivr_f32_x(ptrue, svsqrt_f32_x(ptrue, values), ONE_F32); } +<<<<<<< HEAD Vectorized pow(const Vectorized &b) const { USE_SLEEF( {return Vectorized(Sleef_powfx_u10sve(values, b));}, { @@ -368,6 +716,22 @@ template <> class Vectorized { // Comparison using the _CMP_**_OQ predicate. 
// `O`: get false if an operand is NaN // `Q`: do not raise if an operand is NaN +======= + Vectorized pow(const Vectorized& b) const {USE_SLEEF( + { return Vectorized(Sleef_powfx_u10sve(values, b)); }, + { + __at_align__ float tmp[size()]; + __at_align__ float tmp_b[size()]; + store(tmp); + b.store(tmp_b); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = std::pow(tmp[i], tmp_b[i]); + } + return loadu(tmp); + })} // Comparison using the _CMP_**_OQ predicate. + // `O`: get false if an operand is NaN + // `Q`: do not raise if an operand is NaN +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized operator==(const Vectorized& other) const { svbool_t mask = svcmpeq_f32(ptrue, values, other); return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK); @@ -407,22 +771,46 @@ template <> class Vectorized { }; template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svadd_f32_x(ptrue, a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svsub_f32_x(ptrue, a, b); } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svmul_f32_x(ptrue, a, b); } template <> +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svdiv_f32_x(ptrue, a, b); } @@ -434,33 +822,65 @@ Vectorized inline Vectorized::frac() const { // Implements the IEEE 754 201X `maximum` operation, which propagates NaN if // either input is a NaN. template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svmax_f32_x(ptrue, a, b); } // Implements the IEEE 754 201X `minimum` operation, which propagates NaN if // either input is a NaN. 
template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svmin_f32_x(ptrue, a, b); } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svmin_f32_x(ptrue, max, svmax_f32_x(ptrue, min, a)); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svmin_f32_x(ptrue, max, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svmax_f32_x(ptrue, min, a); } template <> +<<<<<<< HEAD Vectorized inline operator&(const Vectorized& a, const Vectorized& b) { return svreinterpret_f32_s32(svand_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b))); } @@ -496,6 +916,58 @@ Vectorized inline Vectorized::lt(const Vectorized& other) c } Vectorized inline Vectorized::le(const Vectorized& other) const { +======= +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return svreinterpret_f32_s32( + svand_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b))); +} + +template <> +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return svreinterpret_f32_s32( + svorr_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b))); +} + +template <> +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return svreinterpret_f32_s32( + sveor_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b))); +} + +Vectorized inline Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::le( + const Vectorized& other) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*this <= other) & Vectorized(1.0f); } @@ -514,33 +986,52 @@ inline void convert(const float* src, float* dst, int64_t n) { } template <> +<<<<<<< HEAD inline void convert(const float *src, at::Half *dst, int64_t n) { +======= +inline void convert(const float* src, at::Half* dst, int64_t n) { +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const int64_t fraction = n % Vectorized::size(); svbool_t pg_16 = svwhilelt_b16(0ull, Vectorized::size()); svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized::size()); #pragma unroll for (int64_t i = 0; i < n - fraction; i += Vectorized::size()) { +<<<<<<< HEAD svfloat16_t src_vec = svuzp1_f16(svcvt_f16_f32_x(ptrue, svldnt1_f32(pg_32, src + i)), ZERO_F16); +======= + svfloat16_t src_vec = svuzp1_f16( + svcvt_f16_f32_x(ptrue, svldnt1_f32(pg_32, src + i)), ZERO_F16); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) svst1_f16(pg_16, reinterpret_cast(dst) + i, src_vec); } #pragma unroll for (int64_t i = n - fraction; i < n; i += Vectorized::size()) { pg_16 = svwhilelt_b16(i, n); pg_32 = svwhilelt_b32(i, n); +<<<<<<< HEAD svfloat16_t src_vec = svuzp1_f16(svcvt_f16_f32_x(ptrue, svldnt1_f32(pg_32, src + i)), ZERO_F16); +======= + svfloat16_t src_vec = svuzp1_f16( + svcvt_f16_f32_x(ptrue, svldnt1_f32(pg_32, src + i)), ZERO_F16); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) svst1_f16(pg_16, reinterpret_cast(dst) + i, src_vec); } } template <> +<<<<<<< HEAD inline void convert(const at::Half *src, float *dst, int64_t n) { +======= +inline void convert(const at::Half* src, float* dst, int64_t n) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const int64_t fraction = n % Vectorized::size(); svbool_t pg_16 = svwhilelt_b16(0ull, Vectorized::size()); svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized::size()); #pragma unroll for (int64_t i = 0; i < n - fraction; i += Vectorized::size()) { +<<<<<<< HEAD svfloat16_t src_vec = svzip1_f16(svldnt1_f16(pg_16, reinterpret_cast(src) + i), ZERO_F16); svst1_f32(pg_32, dst + i, svcvt_f32_f16_x(ptrue, src_vec)); @@ -551,18 +1042,41 @@ inline void convert(const at::Half *src, float *dst, int64_t n) { pg_32 = svwhilelt_b32(i, n); svfloat16_t src_vec = svzip1_f16(svldnt1_f16(pg_16, reinterpret_cast(src) + i), ZERO_F16); +======= + svfloat16_t src_vec = svzip1_f16( + svldnt1_f16(pg_16, reinterpret_cast(src) + i), + ZERO_F16); + svst1_f32(pg_32, dst + i, svcvt_f32_f16_x(ptrue, src_vec)); + } +#pragma unroll + for (int64_t i = n - fraction; i < n; i += Vectorized::size()) { + pg_16 = svwhilelt_b16(i, n); + pg_32 = svwhilelt_b32(i, n); + svfloat16_t src_vec = svzip1_f16( + svldnt1_f16(pg_16, reinterpret_cast(src) + i), + ZERO_F16); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) svst1_f32(pg_32, dst + i, svcvt_f32_f16_x(ptrue, src_vec)); } } template <> +<<<<<<< HEAD inline void convert(const bool *src, float *dst, int64_t n) { +======= +inline void convert(const bool* src, float* dst, int64_t n) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const int64_t fraction = n % Vectorized::size(); svbool_t pg_8 = svwhilelt_b8(0ull, Vectorized::size()); svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized::size()); #pragma unroll for (int64_t i = 0; i < n - fraction; i += Vectorized::size()) { +<<<<<<< HEAD svuint8_t src_vec_u8 = svldnt1_u8(pg_8, reinterpret_cast(src) + i); +======= + svuint8_t src_vec_u8 = + svldnt1_u8(pg_8, reinterpret_cast(src) + i); +>>>>>>> 5729657180 ([ROCm] Specialized binary 
elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) svuint32_t src_vec_u32 = svunpklo_u32(svunpklo_u16(src_vec_u8)); svbool_t mask = svcmpne_u32(pg_32, src_vec_u32, ZERO_U32); svst1_f32(pg_32, dst + i, svsel_f32(mask, ONE_F32, ZERO_F32)); @@ -571,7 +1085,12 @@ inline void convert(const bool *src, float *dst, int64_t n) { for (int64_t i = n - fraction; i < n; i += Vectorized::size()) { pg_8 = svwhilelt_b8(i, n); pg_32 = svwhilelt_b32(i, n); +<<<<<<< HEAD svuint8_t src_vec_u8 = svldnt1_u8(pg_8, reinterpret_cast(src) + i); +======= + svuint8_t src_vec_u8 = + svldnt1_u8(pg_8, reinterpret_cast(src) + i); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) svuint32_t src_vec_u32 = svunpklo_u32(svunpklo_u16(src_vec_u8)); svbool_t mask = svcmpne_u32(pg_32, src_vec_u32, ZERO_U32); svst1_f32(pg_32, dst + i, svsel_f32(mask, ONE_F32, ZERO_F32)); @@ -579,10 +1098,22 @@ inline void convert(const bool *src, float *dst, int64_t n) { } template <> +<<<<<<< HEAD Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svmad_f32_x(ptrue, a, b, c); } #endif // defined(CPU_CAPABILITY_SVE) +<<<<<<< HEAD }} +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/sve/vec_int.h b/aten/src/ATen/cpu/vec/sve/vec_int.h index 1e8c76ab0572..03c26a6ac909 100644 --- a/aten/src/ATen/cpu/vec/sve/vec_int.h +++ b/aten/src/ATen/cpu/vec/sve/vec_int.h @@ -1,9 +1,14 @@ #pragma once #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::vec { // Note [CPU_CAPABILITY namespace] @@ -18,6 +23,7 @@ inline namespace CPU_CAPABILITY { #if defined(CPU_CAPABILITY_SVE) +<<<<<<< HEAD #define VEC_INT_SVE_TEMPLATE(vl, bit) \ template <> class Vectorized { \ private: \ @@ -217,6 +223,247 @@ Vectorized inline Vectorized::lt(const Vectorized inline Vectorized::le(const Vectorized& other) const { \ return (*this <= other) & Vectorized(1); \ } +======= +#define VEC_INT_SVE_TEMPLATE(vl, bit) \ + template <> \ + struct is_vec_specialized_for : std::bool_constant {}; \ + \ + template <> \ + class Vectorized { \ + private: \ + vls_int##bit##_t values; \ + \ + public: \ + using value_type = int##bit##_t; \ + using size_type = int; \ + static constexpr size_type size() { \ + return vl; \ + } \ + Vectorized() {} \ + Vectorized(svint##bit##_t v) : values(v) {} \ + Vectorized(int##bit##_t val) { \ + values = svdup_n_s##bit(val); \ + } \ + template < \ + typename... Args, \ + typename = std::enable_if_t<(sizeof...(Args) == size())>> \ + Vectorized(Args... vals) { \ + __at_align__ int##bit##_t buffer[size()] = {vals...}; \ + values = svld1_s##bit(ptrue, buffer); \ + } \ + operator svint##bit##_t() const { \ + return values; \ + } \ + template \ + static Vectorized blend( \ + const Vectorized& a, \ + const Vectorized& b) { \ + __at_align__ int##bit##_t flag_arr[size()]; \ + for (int i = 0; i < size(); ++i) { \ + flag_arr[i] = (i < 64 && (mask & (1ULL << i))) ? 
1 : 0; \ + } \ + svbool_t blend_mask = svcmpne_n_s##bit( \ + svptrue_b##bit(), svld1_s##bit(svptrue_b##bit(), flag_arr), 0); \ + return Vectorized( \ + svsel_s##bit(blend_mask, b.values, a.values)); \ + } \ + static Vectorized blendv( \ + const Vectorized& a, \ + const Vectorized& b, \ + const Vectorized& mask_) { \ + svbool_t mask = svcmpeq_s##bit(ptrue, mask_, ALL_S##bit##_TRUE_MASK); \ + return svsel_s##bit(mask, b, a); \ + } \ + /* step sometimes requires a higher precision type (e.g., T=int, \ + * step_t=double) */ \ + template \ + static Vectorized arange( \ + int##bit##_t base = 0, \ + step_t step = static_cast(1)) { \ + __at_align__ int##bit##_t buffer[size()]; \ + for (int64_t i = 0; i < size(); i++) { \ + buffer[i] = base + i * step; \ + } \ + return svld1_s##bit(ptrue, buffer); \ + } \ + static Vectorized set( \ + const Vectorized& a, \ + const Vectorized& b, \ + int##bit##_t count = size()) { \ + if (count == 0) { \ + return a; \ + } else if (count < size()) { \ + return svsel_s##bit(svwhilelt_b##bit(0ull, count), b, a); \ + } \ + return b; \ + } \ + static Vectorized loadu( \ + const void* ptr, \ + int64_t count = size()) { \ + if (count == size()) \ + return svld1_s##bit( \ + ptrue, reinterpret_cast(ptr)); \ + svbool_t pg = svwhilelt_b##bit(0ull, count); \ + return svld1_s##bit(pg, reinterpret_cast(ptr)); \ + } \ + void store(void* ptr, int64_t count = size()) const { \ + if (count == size()) { \ + svst1_s##bit(ptrue, reinterpret_cast(ptr), values); \ + } else { \ + svbool_t pg = svwhilelt_b##bit(0ull, count); \ + svst1_s##bit(pg, reinterpret_cast(ptr), values); \ + } \ + } \ + const int##bit##_t& operator[](int idx) const = delete; \ + int##bit##_t& operator[](int idx) = delete; \ + Vectorized abs() const { \ + return svabs_s##bit##_x(ptrue, values); \ + } \ + Vectorized real() const { \ + return values; \ + } \ + Vectorized imag() const { \ + return svdup_n_s##bit(0); \ + } \ + Vectorized conj() const { \ + return values; \ + } \ + Vectorized frac() const; \ + Vectorized neg() const { \ + return svneg_s##bit##_x(ptrue, values); \ + } \ + Vectorized operator==( \ + const Vectorized& other) const { \ + svbool_t mask = svcmpeq_s##bit(ptrue, values, other); \ + return svsel_s##bit( \ + mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK); \ + } \ + Vectorized operator!=( \ + const Vectorized& other) const { \ + svbool_t mask = svcmpne_s##bit(ptrue, values, other); \ + return svsel_s##bit( \ + mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK); \ + } \ + Vectorized operator<( \ + const Vectorized& other) const { \ + svbool_t mask = svcmplt_s##bit(ptrue, values, other); \ + return svsel_s##bit( \ + mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK); \ + } \ + Vectorized operator<=( \ + const Vectorized& other) const { \ + svbool_t mask = svcmple_s##bit(ptrue, values, other); \ + return svsel_s##bit( \ + mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK); \ + } \ + Vectorized operator>( \ + const Vectorized& other) const { \ + svbool_t mask = svcmpgt_s##bit(ptrue, values, other); \ + return svsel_s##bit( \ + mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK); \ + } \ + Vectorized operator>=( \ + const Vectorized& other) const { \ + svbool_t mask = svcmpge_s##bit(ptrue, values, other); \ + return svsel_s##bit( \ + mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK); \ + } \ + Vectorized eq(const Vectorized& other) const; \ + Vectorized ne(const Vectorized& other) const; \ + Vectorized gt(const Vectorized& other) const; \ + Vectorized ge(const Vectorized& other) 
const; \ + Vectorized lt(const Vectorized& other) const; \ + Vectorized le(const Vectorized& other) const; \ + }; \ + template <> \ + Vectorized inline operator+( \ + const Vectorized& a, const Vectorized& b) { \ + return svadd_s##bit##_x(ptrue, a, b); \ + } \ + template <> \ + Vectorized inline operator-( \ + const Vectorized& a, const Vectorized& b) { \ + return svsub_s##bit##_x(ptrue, a, b); \ + } \ + template <> \ + Vectorized inline operator*( \ + const Vectorized& a, const Vectorized& b) { \ + return svmul_s##bit##_x(ptrue, a, b); \ + } \ + template <> \ + Vectorized inline maximum( \ + const Vectorized& a, const Vectorized& b) { \ + return svmax_s##bit##_x(ptrue, a, b); \ + } \ + template <> \ + Vectorized inline minimum( \ + const Vectorized& a, const Vectorized& b) { \ + return svmin_s##bit##_x(ptrue, a, b); \ + } \ + template <> \ + Vectorized inline clamp( \ + const Vectorized& a, \ + const Vectorized& min, \ + const Vectorized& max) { \ + return svmin_s##bit##_x(ptrue, max, svmax_s##bit##_x(ptrue, min, a)); \ + } \ + template <> \ + Vectorized inline clamp_max( \ + const Vectorized& a, \ + const Vectorized& max) { \ + return svmin_s##bit##_x(ptrue, max, a); \ + } \ + template <> \ + Vectorized inline clamp_min( \ + const Vectorized& a, \ + const Vectorized& min) { \ + return svmax_s##bit##_x(ptrue, min, a); \ + } \ + template <> \ + Vectorized inline operator&( \ + const Vectorized& a, const Vectorized& b) { \ + return svand_s##bit##_x(ptrue, a, b); \ + } \ + template <> \ + Vectorized inline operator|( \ + const Vectorized& a, const Vectorized& b) { \ + return svorr_s##bit##_x(ptrue, a, b); \ + } \ + template <> \ + Vectorized inline operator^( \ + const Vectorized& a, const Vectorized& b) { \ + return sveor_s##bit##_x(ptrue, a, b); \ + } \ + template <> \ + inline Vectorized operator~( \ + const Vectorized& a) { \ + return sveor_s##bit##_x(ptrue, a, svdup_n_s##bit(-1)); \ + } \ + Vectorized inline Vectorized::eq( \ + const Vectorized& other) const { \ + return (*this == other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::ne( \ + const Vectorized& other) const { \ + return (*this != other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::gt( \ + const Vectorized& other) const { \ + return (*this > other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::ge( \ + const Vectorized& other) const { \ + return (*this >= other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::lt( \ + const Vectorized& other) const { \ + return (*this < other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::le( \ + const Vectorized& other) const { \ + return (*this <= other) & Vectorized(1); \ + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VEC_INT_SVE_TEMPLATE(VECTOR_WIDTH / sizeof(int64_t), 64) VEC_INT_SVE_TEMPLATE(VECTOR_WIDTH / sizeof(int32_t), 32) @@ -224,7 +471,13 @@ VEC_INT_SVE_TEMPLATE(VECTOR_WIDTH / sizeof(int16_t), 16) VEC_INT_SVE_TEMPLATE(VECTOR_WIDTH / sizeof(int8_t), 8) template +<<<<<<< HEAD Vectorized inline intdiv_nosve(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline intdiv_nosve( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) T values_a[Vectorized::size()]; T values_b[Vectorized::size()]; a.store(values_a); @@ -236,27 +489,55 @@ Vectorized inline intdiv_nosve(const Vectorized& a, const Vectorized& b } template 
<> +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svdiv_s64_x(ptrue, a, b); } template <> +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svdiv_s32_x(ptrue, a, b); } template <> +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return intdiv_nosve(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return intdiv_nosve(a, b); } template <> +<<<<<<< HEAD inline void convert(const int32_t *src, int64_t *dst, int64_t n) { +======= +inline void convert(const int32_t* src, int64_t* dst, int64_t n) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const int64_t fraction = n % Vectorized::size(); svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized::size()); svbool_t pg_64 = svwhilelt_b64(0ull, Vectorized::size()); @@ -272,14 +553,23 @@ inline void convert(const int32_t *src, int64_t *dst, int64_t n) { } template <> +<<<<<<< HEAD inline void convert(const int64_t *src, float *dst, int64_t n) { +======= +inline void convert(const int64_t* src, float* dst, int64_t n) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const int64_t fraction = n % Vectorized::size(); svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized::size()); svbool_t pg_64 = svwhilelt_b64(0ull, Vectorized::size()); #pragma unroll for (int64_t i = 0; i < n - fraction; i += Vectorized::size()) { svint64_t src_vec_s64 = svldnt1_s64(pg_64, src + i); +<<<<<<< HEAD svfloat32_t src_vec_f32 = svuzp1_f32(svcvt_f32_s64_x(pg_64, src_vec_s64), ZERO_F32); +======= + svfloat32_t src_vec_f32 = + svuzp1_f32(svcvt_f32_s64_x(pg_64, src_vec_s64), ZERO_F32); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) svst1_f32(pg_32, dst + i, src_vec_f32); } #pragma unroll @@ -287,13 +577,22 @@ inline void convert(const int64_t *src, float *dst, int64_t n) { pg_32 = svwhilelt_b32(i, n); pg_64 = svwhilelt_b64(i, n); svint64_t src_vec_s64 = svldnt1_s64(pg_64, src + i); +<<<<<<< HEAD svfloat32_t src_vec_f32 = svuzp1_f32(svcvt_f32_s64_x(pg_64, src_vec_s64), ZERO_F32); +======= + svfloat32_t src_vec_f32 = + svuzp1_f32(svcvt_f32_s64_x(pg_64, src_vec_s64), ZERO_F32); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) svst1_f32(pg_32, dst + i, src_vec_f32); } } template <> +<<<<<<< HEAD inline void convert(const int32_t *src, float *dst, int64_t n) { 
+======= +inline void convert(const int32_t* src, float* dst, int64_t n) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const int64_t fraction = n % Vectorized::size(); svbool_t pg = svwhilelt_b32(0ull, Vectorized::size()); #pragma unroll @@ -310,14 +609,25 @@ inline void convert(const int32_t *src, float *dst, int64_t n) { } template <> +<<<<<<< HEAD inline void convert(const bool *src, int64_t *dst, int64_t n) { +======= +inline void convert(const bool* src, int64_t* dst, int64_t n) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const int64_t fraction = n % Vectorized::size(); svbool_t pg_8 = svwhilelt_b8(0ull, Vectorized::size()); svbool_t pg_64 = svwhilelt_b64(0ull, Vectorized::size()); #pragma unroll for (int64_t i = 0; i < n - fraction; i += Vectorized::size()) { +<<<<<<< HEAD svuint8_t src_vec_u8 = svldnt1_u8(pg_8, reinterpret_cast(src) + i); svuint64_t src_vec_u64 = svunpklo_u64(svunpklo_u32(svunpklo_u16(src_vec_u8))); +======= + svuint8_t src_vec_u8 = + svldnt1_u8(pg_8, reinterpret_cast(src) + i); + svuint64_t src_vec_u64 = + svunpklo_u64(svunpklo_u32(svunpklo_u16(src_vec_u8))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) svbool_t mask = svcmpne_u64(pg_64, src_vec_u64, ZERO_U64); svst1_s64(pg_64, dst + i, svsel_s64(mask, ONE_S64, ZERO_S64)); } @@ -325,21 +635,37 @@ inline void convert(const bool *src, int64_t *dst, int64_t n) { for (int64_t i = n - fraction; i < n; i += Vectorized::size()) { pg_8 = svwhilelt_b8(i, n); pg_64 = svwhilelt_b64(i, n); +<<<<<<< HEAD svuint8_t src_vec_u8 = svldnt1_u8(pg_8, reinterpret_cast(src) + i); svuint64_t src_vec_u64 = svunpklo_u64(svunpklo_u32(svunpklo_u16(src_vec_u8))); +======= + svuint8_t src_vec_u8 = + svldnt1_u8(pg_8, reinterpret_cast(src) + i); + svuint64_t src_vec_u64 = + svunpklo_u64(svunpklo_u32(svunpklo_u16(src_vec_u8))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) svbool_t mask = svcmpne_u64(pg_64, src_vec_u64, ZERO_U64); svst1_s64(pg_64, dst + i, svsel_s64(mask, ONE_S64, ZERO_S64)); } } template <> +<<<<<<< HEAD inline void convert(const bool *src, int32_t *dst, int64_t n) { +======= +inline void convert(const bool* src, int32_t* dst, int64_t n) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const int64_t fraction = n % Vectorized::size(); svbool_t pg_8 = svwhilelt_b8(0ull, Vectorized::size()); svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized::size()); #pragma unroll for (int64_t i = 0; i < n - fraction; i += Vectorized::size()) { +<<<<<<< HEAD svuint8_t src_vec_u8 = svldnt1_u8(pg_8, reinterpret_cast(src) + i); +======= + svuint8_t src_vec_u8 = + svldnt1_u8(pg_8, reinterpret_cast(src) + i); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) svuint32_t src_vec_u32 = svunpklo_u32(svunpklo_u16(src_vec_u8)); svbool_t mask = svcmpne_u32(pg_32, src_vec_u32, ZERO_U32); svst1_s32(pg_32, dst + i, svsel_s32(mask, ONE_S32, ZERO_S32)); @@ -348,7 +674,12 @@ inline void convert(const bool *src, int32_t *dst, int64_t n) { for (int64_t i = n - fraction; i < n; i += Vectorized::size()) { pg_8 = svwhilelt_b8(i, n); pg_32 = svwhilelt_b32(i, n); +<<<<<<< HEAD svuint8_t 
src_vec_u8 = svldnt1_u8(pg_8, reinterpret_cast(src) + i); +======= + svuint8_t src_vec_u8 = + svldnt1_u8(pg_8, reinterpret_cast(src) + i); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) svuint32_t src_vec_u32 = svunpklo_u32(svunpklo_u16(src_vec_u8)); svbool_t mask = svcmpne_u32(pg_32, src_vec_u32, ZERO_U32); svst1_s32(pg_32, dst + i, svsel_s32(mask, ONE_S32, ZERO_S32)); @@ -356,64 +687,135 @@ inline void convert(const bool *src, int32_t *dst, int64_t n) { } template <> +<<<<<<< HEAD inline void convert(const uint8_t *src, bool *dst, int64_t n) { +======= +inline void convert(const uint8_t* src, bool* dst, int64_t n) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const int64_t fraction = n % Vectorized::size(); svbool_t pg = svwhilelt_b8(0ull, Vectorized::size()); #pragma unroll for (int64_t i = 0; i < n - fraction; i += Vectorized::size()) { svbool_t mask = svcmpne_u8(pg, svldnt1_u8(pg, src + i), ZERO_U8); +<<<<<<< HEAD svst1_u8(pg, reinterpret_cast(dst) + i, svsel_u8(mask, ALL_U8_TRUE_MASK, ALL_U8_FALSE_MASK)); +======= + svst1_u8( + pg, + reinterpret_cast(dst) + i, + svsel_u8(mask, ALL_U8_TRUE_MASK, ALL_U8_FALSE_MASK)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #pragma unroll for (int64_t i = n - fraction; i < n; i += Vectorized::size()) { pg = svwhilelt_b8(i, n); svbool_t mask = svcmpne_u8(pg, svldnt1_u8(pg, src + i), ZERO_U8); +<<<<<<< HEAD svst1_u8(pg, reinterpret_cast(dst) + i, svsel_u8(mask, ALL_U8_TRUE_MASK, ALL_U8_FALSE_MASK)); +======= + svst1_u8( + pg, + reinterpret_cast(dst) + i, + svsel_u8(mask, ALL_U8_TRUE_MASK, ALL_U8_FALSE_MASK)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svlsl_s64_x(ptrue, a, svreinterpret_u64_s64(b)); } template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svlsl_s32_x(ptrue, a, svreinterpret_u32_s32(b)); } template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svlsl_s16_x(ptrue, a, svreinterpret_u16_s16(b)); } template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svlsl_s8_x(ptrue, a, svreinterpret_u8_s8(b)); } template <> +<<<<<<< HEAD Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { +======= 
+Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svasr_s64_x(ptrue, a, svreinterpret_u64_s64(b)); } template <> +<<<<<<< HEAD Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svasr_s32_x(ptrue, a, svreinterpret_u32_s32(b)); } template <> +<<<<<<< HEAD Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svasr_s16_x(ptrue, a, svreinterpret_u16_s16(b)); } template <> +<<<<<<< HEAD Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svasr_s8_x(ptrue, a, svreinterpret_u8_s8(b)); } #endif // defined(CPU_CAPABILITY_SVE) +<<<<<<< HEAD }} +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/sve/vec_qint.h b/aten/src/ATen/cpu/vec/sve/vec_qint.h index 96e201ef36a2..5f9172f60256 100644 --- a/aten/src/ATen/cpu/vec/sve/vec_qint.h +++ b/aten/src/ATen/cpu/vec/sve/vec_qint.h @@ -32,9 +32,14 @@ // specified by float_vec_return_type. // // When writing kernels with these vectors, it is expected that floating- +<<<<<<< HEAD // point operations will be carried out in a loop over Vectorized::float_num_vecs // iterations. +======= +// point operations will be carried out in a loop over +// Vectorized::float_num_vecs iterations. 
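
Annotation (not part of the patch): the operator<< and operator>> specializations above share one idea: left shifts use the logical shift intrinsic (svlsl), right shifts use the arithmetic shift (svasr), and the per-lane shift counts, which arrive as a signed vector, are reinterpreted as unsigned lanes. A sketch for the int32 case, assuming SVE:

#include <arm_sve.h>

inline svint32_t shl_s32_sketch(svint32_t a, svint32_t counts) {
  // Logical left shift; counts reinterpreted as unsigned per-lane amounts.
  return svlsl_s32_x(svptrue_b32(), a, svreinterpret_u32_s32(counts));
}

inline svint32_t sar_s32_sketch(svint32_t a, svint32_t counts) {
  // Arithmetic right shift preserves the sign bit of each lane.
  return svasr_s32_x(svptrue_b32(), a, svreinterpret_u32_s32(counts));
}
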
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::vec { // Note [CPU_CAPABILITY namespace] @@ -108,8 +113,15 @@ struct VectorizedQuantizedConverter { for (int i = 0; i < float_num_vecs(); ++i) { float tmp_vals[Vectorized::size()]; for (int j = 0; j < Vectorized::size(); ++j) { +<<<<<<< HEAD tmp_vals[j] = at::native::dequantize_val(tmp_scale[j], tmp_zero_point[j], T(vals[Vectorized::size() * i + j])); +======= + tmp_vals[j] = at::native::dequantize_val( + tmp_scale[j], + tmp_zero_point[j], + T(vals[Vectorized::size() * i + j])); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } rv[i] = Vectorized::loadu(tmp_vals); } @@ -127,8 +139,15 @@ struct VectorizedQuantizedConverter { for (int i = 0; i < float_num_vecs(); ++i) { float tmp_vals[Vectorized::size()]; for (int j = 0; j < Vectorized::size(); ++j) { +<<<<<<< HEAD tmp_vals[j] = at::native::dequantize_val(tmp_scale[j], tmp_zero_point[j], T(vals[Vectorized::size() * i + j])); +======= + tmp_vals[j] = at::native::dequantize_val( + tmp_scale[j], + tmp_zero_point[j], + T(vals[Vectorized::size() * i + j])); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } rv[i] = Vectorized::loadu(tmp_vals); } @@ -140,11 +159,22 @@ struct VectorizedQuantizedConverter { }; template <> +<<<<<<< HEAD struct Vectorized : public VectorizedQuantizedConverter< c10::qint32, std::array, 1>, std::array, 1>, VECTOR_WIDTH / 4> { +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public VectorizedQuantizedConverter< + c10::qint32, + std::array, 1>, + std::array, 1>, + VECTOR_WIDTH / 4> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized() : VectorizedQuantizedConverter< c10::qint32, @@ -169,6 +199,7 @@ struct Vectorized : public VectorizedQuantizedConverter< } static Vectorized loadu(const void* ptr, int64_t count) { +<<<<<<< HEAD __at_align__ value_type tmp_values[size()]; // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two @@ -181,6 +212,26 @@ struct Vectorized : public VectorizedQuantizedConverter< } #else static Vectorized loadu(const void* ptr, int64_t count = size()) { +======= + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
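
Annotation (not part of the patch): the dequantize loops above call at::native::dequantize_val per lane. That helper lives in ATen's quantization utilities; the sketch below only restates the standard affine mapping it is expected to compute, not its exact implementation.

#include <cstdint>

inline float dequantize_affine_sketch(float scale, int32_t zero_point, int32_t q) {
  // real_value = scale * (quantized_value - zero_point)
  return scale * static_cast<float>(q - zero_point);
}
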
+ for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return loadu(tmp_values); + } +#else + static Vectorized loadu( + const void* ptr, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (count == size()) return svld1_s32(ptrue, reinterpret_cast(ptr)); svbool_t pg = svwhilelt_b32(0ull, count); @@ -196,7 +247,13 @@ struct Vectorized : public VectorizedQuantizedConverter< std::array::size()> float_vals; for (int i = 0; i < float_num_vecs(); ++i) { +<<<<<<< HEAD rhs[i].store(&float_vals[i * Vectorized::size()], Vectorized::size()); +======= + rhs[i].store( + &float_vals[i * Vectorized::size()], + Vectorized::size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } at::native::quantize_vec( @@ -225,11 +282,18 @@ struct Vectorized : public VectorizedQuantizedConverter< return retval; } +<<<<<<< HEAD Vectorized relu(Vectorized zero_point) const { return maximum(zero_point); } +======= + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized relu6( Vectorized zero_point, Vectorized q_six) { @@ -264,7 +328,13 @@ struct Vectorized : public VectorizedQuantizedConverter< }; template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.maximum(b); } @@ -291,11 +361,22 @@ Vectorized inline operator+( } template <> +<<<<<<< HEAD struct Vectorized : public VectorizedQuantizedConverter< c10::qint8, std::array, 4>, std::array, 4>, VECTOR_WIDTH> { +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public VectorizedQuantizedConverter< + c10::qint8, + std::array, 4>, + std::array, 4>, + VECTOR_WIDTH> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized() : VectorizedQuantizedConverter< c10::qint8, @@ -320,6 +401,7 @@ struct Vectorized : public VectorizedQuantizedConverter< } static Vectorized loadu(const void* ptr, int64_t count) { +<<<<<<< HEAD __at_align__ value_type tmp_values[size()]; // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two @@ -329,6 +411,21 @@ struct Vectorized : public VectorizedQuantizedConverter< } std::memcpy(tmp_values, reinterpret_cast(ptr), count * sizeof(value_type)); return loadu(tmp_values); +======= + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
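
Annotation (not part of the patch): the loadu(ptr, count) overloads above all use the same partial-load idiom: give every lane of a stack buffer a defined value (the comment in the diff explains why an explicit loop is preferred over "={0}"), copy only the valid elements, then hand the buffer to the full-width loadu. A generic sketch, with an illustrative name:

#include <cstdint>
#include <cstring>

template <typename T, int N>
void safe_partial_load_sketch(const void* ptr, int64_t count, T (&lanes)[N]) {
  for (int i = 0; i < N; ++i) {
    lanes[i] = T(0); // defined values for the lanes beyond `count`
  }
  // Copy only the valid elements; the caller then performs a full-width
  // vector load from `lanes`.
  std::memcpy(lanes, ptr, static_cast<size_t>(count) * sizeof(T));
}
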
+ for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return loadu(tmp_values); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } static Vectorized quantize( @@ -340,7 +437,13 @@ struct Vectorized : public VectorizedQuantizedConverter< std::array::size()> float_vals; for (int i = 0; i < float_num_vecs(); ++i) { +<<<<<<< HEAD rhs[i].store(&float_vals[i * Vectorized::size()], Vectorized::size()); +======= + rhs[i].store( + &float_vals[i * Vectorized::size()], + Vectorized::size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } at::native::quantize_vec( @@ -418,16 +521,33 @@ struct Vectorized : public VectorizedQuantizedConverter< }; template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.maximum(b); } template <> +<<<<<<< HEAD struct Vectorized : public VectorizedQuantizedConverter< c10::quint8, std::array, 4>, std::array, 4>, VECTOR_WIDTH> { +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public VectorizedQuantizedConverter< + c10::quint8, + std::array, 4>, + std::array, 4>, + VECTOR_WIDTH> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized() : VectorizedQuantizedConverter< c10::quint8, @@ -452,6 +572,7 @@ struct Vectorized : public VectorizedQuantizedConverter< } static Vectorized loadu(const void* ptr, int64_t count) { +<<<<<<< HEAD __at_align__ value_type tmp_values[size()]; // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two @@ -464,6 +585,26 @@ struct Vectorized : public VectorizedQuantizedConverter< } #else static Vectorized loadu(const void* ptr, int64_t count = size()) { +======= + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
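
Annotation (not part of the patch): the quantize() methods above gather float lanes and hand them to at::native::quantize_vec, and relu on quantized data is a plain max against the zero point. The scalar sketch below shows the usual affine quantization (round, add zero point, clamp; qmin/qmax shown for int8) and the quantized relu; it restates the standard formulas, not ATen's exact implementation.

#include <algorithm>
#include <cmath>
#include <cstdint>

inline int8_t quantize_affine_q8_sketch(float x, float scale, int32_t zero_point) {
  const int32_t qmin = -128, qmax = 127;
  const int32_t q = static_cast<int32_t>(std::nearbyint(x / scale)) + zero_point;
  return static_cast<int8_t>(std::min(qmax, std::max(qmin, q)));
}

inline int8_t qrelu_q8_sketch(int8_t q, int8_t zero_point) {
  // relu in the quantized domain: clamp below at the zero point.
  return std::max(q, zero_point);
}
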
+ for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return loadu(tmp_values); + } +#else + static Vectorized loadu( + const void* ptr, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (count == size()) return svld1_u8(ptrue, reinterpret_cast(ptr)); svbool_t pg = svwhilelt_b8(0ull, count); @@ -479,7 +620,13 @@ struct Vectorized : public VectorizedQuantizedConverter< std::array::size()> float_vals; for (int i = 0; i < float_num_vecs(); ++i) { +<<<<<<< HEAD rhs[i].store(&float_vals[i * Vectorized::size()], Vectorized::size()); +======= + rhs[i].store( + &float_vals[i * Vectorized::size()], + Vectorized::size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } at::native::quantize_vec( @@ -512,7 +659,10 @@ struct Vectorized : public VectorizedQuantizedConverter< return maximum(zero_point); } +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized relu6( Vectorized zero_point, Vectorized q_six) { @@ -558,10 +708,21 @@ struct Vectorized : public VectorizedQuantizedConverter< }; template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.maximum(b); } #endif // defined(CPU_CAPABILITY_SVE) +<<<<<<< HEAD }} +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec.h b/aten/src/ATen/cpu/vec/vec.h index e4b0c4b95d84..6e2ea59d6a1e 100644 --- a/aten/src/ATen/cpu/vec/vec.h +++ b/aten/src/ATen/cpu/vec/vec.h @@ -28,12 +28,19 @@ inline Vectorized Vectorized::loadu(const void* ptr) { } template <> +<<<<<<< HEAD inline Vectorized Vectorized::loadu(const void* ptr, int64_t count) { +======= +inline Vectorized Vectorized::loadu( + const void* ptr, + int64_t count) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // See NOTE [Loading boolean values] return convert_to_bool(Vectorized::loadu(ptr, count)); } template +<<<<<<< HEAD struct VecHoldType { using hold_type = typename VT::value_type; }; template <> @@ -41,8 +48,28 @@ struct VecHoldType> { using hold_type = BFloat16; }; template <> struct VecHoldType> {using hold_type = Half; }; +======= +struct VecHoldType { + using hold_type = typename VT::value_type; +}; + +template <> +struct VecHoldType> { + using hold_type = BFloat16; +}; + +template <> +struct VecHoldType> { + using hold_type = Half; +}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template using vechold_type = typename VecHoldType::hold_type; +<<<<<<< HEAD }} // namespace at::vec::CPU_CAPABILITY +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git 
a/aten/src/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h b/aten/src/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h index 7d594c696f7a..b476c31e9a95 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h @@ -11,8 +11,12 @@ namespace at::vec { // See Note [CPU_CAPABILITY namespace] +<<<<<<< HEAD inline namespace CPU_CAPABILITY { +======= +inline namespace CPU_CAPABILITY { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Following vec128_half_neon.h, we only support aarch64. #if !defined(C10_MOBILE) && defined(__aarch64__) @@ -26,6 +30,7 @@ namespace CPU_CAPABILITY { // definitions in case they are actually there!). (See // https://godbolt.org/z/orv6e94n4 ) So, we need to handle it as // uint16_t in that case. +<<<<<<< HEAD #define IMPLEMENT_AT_BF16_SHIM(vec_suffix) \ inline at_bfloat16x4_t at_vget_low_bf16( \ at_bfloat16x8_t a) { \ @@ -90,6 +95,65 @@ namespace CPU_CAPABILITY { } else { \ return vreinterpret_u16_bf16(val); \ } \ +======= +#define IMPLEMENT_AT_BF16_SHIM(vec_suffix) \ + inline at_bfloat16x4_t at_vget_low_bf16(at_bfloat16x8_t a) { \ + return vget_low_##vec_suffix(a); \ + } \ + \ + inline at_bfloat16x4_t at_vget_high_bf16(at_bfloat16x8_t a) { \ + return vget_high_##vec_suffix(a); \ + } \ + \ + inline at_bfloat16x8_t at_vcombine_bf16( \ + at_bfloat16x4_t low, at_bfloat16x4_t high) { \ + return vcombine_##vec_suffix(low, high); \ + } \ + \ + inline at_bfloat16x8_t at_vdupq_n_bf16(at_bfloat16_t value) { \ + return vdupq_n_##vec_suffix(value); \ + } \ + \ + inline at_bfloat16x8_t at_vld1q_bf16(const at_bfloat16_t* ptr) { \ + return vld1q_##vec_suffix(ptr); \ + } \ + \ + inline void at_vst1q_bf16(at_bfloat16_t* ptr, at_bfloat16x8_t value) { \ + vst1q_##vec_suffix(ptr, value); \ + } \ + \ + template \ + inline at_bfloat16x8_t at_vreinterpretq_bf16_u16(T val) { \ + if constexpr (std::is_same_v) { \ + return val; \ + } else { \ + return vreinterpretq_bf16_u16(val); \ + } \ + } \ + template \ + inline at_bfloat16x4_t at_vreinterpret_bf16_u16(T val) { \ + if constexpr (std::is_same_v) { \ + return val; \ + } else { \ + return vreinterpret_bf16_u16(val); \ + } \ + } \ + template \ + inline uint16x8_t at_vreinterpretq_u16_bf16(T val) { \ + if constexpr (std::is_same_v) { \ + return val; \ + } else { \ + return vreinterpretq_u16_bf16(val); \ + } \ + } \ + template \ + inline uint16x4_t at_vreinterpret_u16_bf16(T val) { \ + if constexpr (std::is_same_v) { \ + return val; \ + } else { \ + return vreinterpret_u16_bf16(val); \ + } \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #ifdef __ARM_FEATURE_BF16 @@ -137,11 +201,34 @@ struct BlendBFloat16Regs { }; template <> +<<<<<<< HEAD class Vectorized : public Vectorized16> { using Base = Vectorized16>; friend Base; friend std::tuple, Vectorized> convert_bfloat16_float(const Vectorized& a); friend Vectorized convert_float_bfloat16(const Vectorized& a, const Vectorized& b); +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorized16< + at_bfloat16x8_t, + c10::BFloat16, + BlendBFloat16Regs, + Vectorized> { + using Base = Vectorized16< + at_bfloat16x8_t, + c10::BFloat16, + BlendBFloat16Regs, + Vectorized>; + friend Base; + friend std::tuple, Vectorized> convert_bfloat16_float( + const Vectorized& a); + friend Vectorized convert_float_bfloat16( + const Vectorized& a, + const 
Vectorized& b); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) private: Vectorized map2( const Vectorized& second, @@ -171,10 +258,21 @@ class Vectorized : public Vectorized16); uint32x4_t as_uint32 = vreinterpretq_u32_f32(f32); +<<<<<<< HEAD uint32x4_t rounding_bias = vaddq_u32(vandq_u32(vshrq_n_u32(as_uint32, 16), vdupq_n_u32(1)), vdupq_n_u32(0x7FFF)); at_bfloat16x4_t rounded = vshrn_n_u32(vaddq_u32(as_uint32, rounding_bias), 16); const auto bf16_nan = vdup_n_u16(0x7FC0); return vbsl_u16(vmovn_u32(vreinterpretq_u32_f32(f32.isnan())), bf16_nan, rounded); +======= + uint32x4_t rounding_bias = vaddq_u32( + vandq_u32(vshrq_n_u32(as_uint32, 16), vdupq_n_u32(1)), + vdupq_n_u32(0x7FFF)); + at_bfloat16x4_t rounded = + vshrn_n_u32(vaddq_u32(as_uint32, rounding_bias), 16); + const auto bf16_nan = vdup_n_u16(0x7FC0); + return vbsl_u16( + vmovn_u32(vreinterpretq_u32_f32(f32.isnan())), bf16_nan, rounded); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif // __ARM_FEATURE_BF16 } @@ -215,9 +313,18 @@ class Vectorized : public Vectorized16 mv0 = (Vectorized(v00).*m)(second_v00); Vectorized mv1 = (Vectorized(v01).*m)(second_v01); // Assume the operator returns a bitmask, not "real" floats, and +<<<<<<< HEAD // just narrow the bits. All-ones is a NaN and will get mangled by conversion! at_bfloat16x4_t r00 = at_vreinterpret_bf16_u16(vmovn_u32(vreinterpretq_u32_f32(mv0))); at_bfloat16x4_t r01 = at_vreinterpret_bf16_u16(vmovn_u32(vreinterpretq_u32_f32(mv1))); +======= + // just narrow the bits. All-ones is a NaN and will get mangled by + // conversion! + at_bfloat16x4_t r00 = + at_vreinterpret_bf16_u16(vmovn_u32(vreinterpretq_u32_f32(mv0))); + at_bfloat16x4_t r01 = + at_vreinterpret_bf16_u16(vmovn_u32(vreinterpretq_u32_f32(mv1))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(at_vcombine_bf16(r00, r01)); } @@ -226,7 +333,12 @@ class Vectorized : public Vectorized16(val.x))) {} +======= + Vectorized(c10::BFloat16 val) + : Vectorized16(at_vdupq_n_bf16(c10::bit_cast(val.x))) {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(float val) : Vectorized(c10::BFloat16(val)) {} Vectorized( value_type val0, @@ -238,6 +350,7 @@ class Vectorized : public Vectorized16(val0.x), c10::bit_cast(val1.x), c10::bit_cast(val2.x), @@ -247,11 +360,22 @@ class Vectorized : public Vectorized16(val6.x), c10::bit_cast(val7.x)}) {} +======= + c10::bit_cast(val0.x), + c10::bit_cast(val1.x), + c10::bit_cast(val2.x), + c10::bit_cast(val3.x), + c10::bit_cast(val4.x), + c10::bit_cast(val5.x), + c10::bit_cast(val6.x), + c10::bit_cast(val7.x)}) {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static Vectorized blendv( const Vectorized& a, const Vectorized& b, const Vectorized& mask) { +<<<<<<< HEAD // NOTE: blendv has the same problems as it does for Half; see comments in vec128_half_neon.h. 
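
Annotation (not part of the patch): the non-__ARM_FEATURE_BF16 conversion path above rounds float to bfloat16 with the classic round-to-nearest-even bias trick (add 0x7FFF plus the lowest bit that survives truncation, then drop 16 bits) and replaces NaNs with the quiet pattern 0x7FC0. A scalar sketch of the same arithmetic:

#include <cmath>
#include <cstdint>
#include <cstring>

inline uint16_t float_to_bf16_rne_sketch(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  if (std::isnan(f)) {
    return 0x7FC0; // canonical quiet NaN, matching the vector path above
  }
  const uint32_t rounding_bias = ((bits >> 16) & 1u) + 0x7FFFu;
  return static_cast<uint16_t>((bits + rounding_bias) >> 16);
}
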
Vectorized vec(mask.values); vec.values = at_vreinterpretq_bf16_u16( @@ -259,6 +383,15 @@ class Vectorized : public Vectorized16 vec(mask.values); + vec.values = at_vreinterpretq_bf16_u16(vbslq_u16( + at_vreinterpretq_u16_bf16(vec.values), + at_vreinterpretq_u16_bf16(b.values), + at_vreinterpretq_u16_bf16(a.values))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return vec; } static Vectorized set( @@ -271,6 +404,7 @@ class Vectorized : public Vectorized16 vec( at_vreinterpretq_bf16_u16( vbslq_u16( @@ -281,6 +415,18 @@ class Vectorized : public Vectorized16 loadu(const void* ptr, int64_t count = size()) { +======= + Vectorized vec(at_vreinterpretq_bf16_u16(vbslq_u16( + mask, + at_vreinterpretq_u16_bf16(b.values), + at_vreinterpretq_u16_bf16(a.values)))); + + return vec; + } + static Vectorized loadu( + const void* ptr, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (count == size()) { return at_vld1q_bf16(reinterpret_cast(ptr)); } @@ -332,9 +478,16 @@ class Vectorized : public Vectorized16::name); \ } +<<<<<<< HEAD #define DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(name) \ Vectorized name(const Vectorized& other) const { \ return map2_bitmask_with_vec_float_method(other, &Vectorized::name); \ +======= +#define DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(name) \ + Vectorized name(const Vectorized& other) const { \ + return map2_bitmask_with_vec_float_method( \ + other, &Vectorized::name); \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(abs) @@ -361,6 +514,7 @@ class Vectorized : public Vectorized16 +<<<<<<< HEAD inline std::tuple, Vectorized> convert_bfloat16_float(const Vectorized& a) { static_assert(Vectorized::size() == 2 * Vectorized::size()); at_bfloat16x8_t x = a; @@ -370,6 +524,24 @@ inline std::tuple, Vectorized> convert_bfloat16_float(c } inline Vectorized convert_float_bfloat16(const Vectorized& a, const Vectorized& b) { static_assert(Vectorized::size() == 2 * Vectorized::size()); +======= +inline std::tuple, Vectorized> convert_bfloat16_float( + const Vectorized& a) { + static_assert( + Vectorized::size() == 2 * Vectorized::size()); + at_bfloat16x8_t x = a; + float32x4_t x1 = + Vectorized::convert_f32_bf16(at_vget_low_bf16(x)); + float32x4_t x2 = + Vectorized::convert_f32_bf16(at_vget_high_bf16(x)); + return {Vectorized(x1), Vectorized(x2)}; +} +inline Vectorized convert_float_bfloat16( + const Vectorized& a, + const Vectorized& b) { + static_assert( + Vectorized::size() == 2 * Vectorized::size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at_bfloat16x4_t x1 = Vectorized::convert_bf16_f32(a); at_bfloat16x4_t x2 = Vectorized::convert_bf16_f32(b); return Vectorized(at_vcombine_bf16(x1, x2)); @@ -383,8 +555,12 @@ Vectorized binary_operator_via_float( const auto [a_float_low, a_float_high] = convert_bfloat16_float(a); const auto [b_float_low, b_float_high] = convert_bfloat16_float(b); return convert_float_bfloat16( +<<<<<<< HEAD op(a_float_low, b_float_low), op(a_float_high, b_float_high)); +======= + op(a_float_low, b_float_low), op(a_float_high, b_float_high)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) } template <> @@ -425,7 +601,12 @@ Vectorized inline maximum( const Vectorized& a, const Vectorized& b) { return binary_operator_via_float( +<<<<<<< HEAD static_cast(*)(const Vectorized&, const Vectorized&)>(&maximum), +======= + static_cast (*)( + const Vectorized&, const Vectorized&)>(&maximum), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) a, b); } @@ -435,7 +616,12 @@ Vectorized inline minimum( const Vectorized& a, const Vectorized& b) { return binary_operator_via_float( +<<<<<<< HEAD static_cast(*)(const Vectorized&, const Vectorized&)>(&minimum), +======= + static_cast (*)( + const Vectorized&, const Vectorized&)>(&minimum), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) a, b); } @@ -466,24 +652,39 @@ template <> Vectorized inline operator&( const Vectorized& a, const Vectorized& b) { +<<<<<<< HEAD return Vectorized(at_vreinterpretq_bf16_u16(vandq_u16( at_vreinterpretq_u16_bf16(a), at_vreinterpretq_u16_bf16(b)))); +======= + return Vectorized(at_vreinterpretq_bf16_u16( + vandq_u16(at_vreinterpretq_u16_bf16(a), at_vreinterpretq_u16_bf16(b)))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> Vectorized inline operator|( const Vectorized& a, const Vectorized& b) { +<<<<<<< HEAD return Vectorized(at_vreinterpretq_bf16_u16(vorrq_u16( at_vreinterpretq_u16_bf16(a), at_vreinterpretq_u16_bf16(b)))); +======= + return Vectorized(at_vreinterpretq_bf16_u16( + vorrq_u16(at_vreinterpretq_u16_bf16(a), at_vreinterpretq_u16_bf16(b)))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> Vectorized inline operator^( const Vectorized& a, const Vectorized& b) { +<<<<<<< HEAD return Vectorized(at_vreinterpretq_bf16_u16(veorq_u16( at_vreinterpretq_u16_bf16(a), at_vreinterpretq_u16_bf16(b)))); +======= + return Vectorized(at_vreinterpretq_bf16_u16( + veorq_u16(at_vreinterpretq_u16_bf16(a), at_vreinterpretq_u16_bf16(b)))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline Vectorized Vectorized::eq( diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_convert.h b/aten/src/ATen/cpu/vec/vec128/vec128_convert.h index 4131802c9923..6bda73664ab8 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128_convert.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128_convert.h @@ -11,8 +11,12 @@ struct VecConvert< 1, src_t, 1, +<<<<<<< HEAD typename std::enable_if_t, void>> { +======= + typename std::enable_if_t, void>> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline VectorizedN apply(const VectorizedN& src) { return convert_int8_half_register_to_float(src[0]); } @@ -23,8 +27,12 @@ struct VecConvert< 2, src_t, 1, +<<<<<<< HEAD typename std::enable_if_t, void>> { +======= + typename std::enable_if_t, void>> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline VectorizedN apply(const VectorizedN& src) { const auto [v0, v1] = convert_int8_to_float(src[0]); return VectorizedN(v0, v1); @@ -39,8 +47,15 @@ struct VecConvert { uint16x8_t u16_8 = vld1q_u16(reinterpret_cast(&src[0])); auto u16_low1 = 
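
Annotation (not part of the patch): binary_operator_via_float above implements the widen/compute/narrow pattern: promote both bfloat16 halves to float, apply the float operator, convert back. A scalar sketch of the same idea; the helpers are illustrative, and plain truncation is used for the narrowing step for brevity (the real kernel rounds to nearest even, as sketched earlier).

#include <cstdint>
#include <cstring>

inline float bf16_to_float_sketch(uint16_t h) {
  uint32_t bits = static_cast<uint32_t>(h) << 16; // bf16 is the top half of a float
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

template <typename Op>
uint16_t bf16_binary_via_float_sketch(uint16_t a, uint16_t b, Op op) {
  const float r = op(bf16_to_float_sketch(a), bf16_to_float_sketch(b));
  uint32_t bits;
  std::memcpy(&bits, &r, sizeof(bits));
  return static_cast<uint16_t>(bits >> 16); // truncating narrow, for brevity only
}
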
vget_low_u16(u16_8); auto u16_high1 = vget_high_u16(u16_8); +<<<<<<< HEAD float32x4_t f32x4_0 = vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(u16_low1), 16)); float32x4_t f32x4_1 = vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(u16_high1), 16)); +======= + float32x4_t f32x4_0 = + vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(u16_low1), 16)); + float32x4_t f32x4_1 = + vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(u16_high1), 16)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) result[0] = f32x4_0; result[1] = f32x4_1; return result; @@ -53,7 +68,12 @@ struct VecConvert { const VectorizedN& src) { VectorizedN result; uint16x4_t u16_8 = vld1_u16(reinterpret_cast(&src[0])); +<<<<<<< HEAD float32x4_t f32x4_0 = vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(u16_8), 16)); +======= + float32x4_t f32x4_0 = + vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(u16_8), 16)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) result[0] = f32x4_0; return result; } diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h b/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h index a51a8777fb6d..57a2df9f9ec2 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h @@ -41,6 +41,7 @@ inline namespace CPU_CAPABILITY { #define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code #endif +<<<<<<< HEAD template struct BlendRegs { static float32x4_t impl( @@ -51,22 +52,59 @@ template struct BlendRegs{ static float32x4_t impl( const float32x4_t& a, const float32x4_t& b, float32x4_t& res) { +======= +template +struct BlendRegs { + static float32x4_t impl( + const float32x4_t& a, + const float32x4_t& b, + float32x4_t& res); +}; + +template +struct BlendRegs { + static float32x4_t impl( + const float32x4_t& a, + const float32x4_t& b, + float32x4_t& res) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return vsetq_lane_f32(vgetq_lane_f32(b, index), res, index); } }; +<<<<<<< HEAD template struct BlendRegs{ static float32x4_t impl( const float32x4_t& a, const float32x4_t& b, float32x4_t& res) { +======= +template +struct BlendRegs { + static float32x4_t impl( + const float32x4_t& a, + const float32x4_t& b, + float32x4_t& res) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return vsetq_lane_f32(vgetq_lane_f32(a, index), res, index); } }; +<<<<<<< HEAD template <> class Vectorized { private: float32x4_t values; public: +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + float32x4_t values; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = float; using size_type = int; static constexpr size_type size() { @@ -75,13 +113,19 @@ template <> class Vectorized { Vectorized() {} Vectorized(float32x4_t v) : values(v) {} Vectorized(float val) : values{vdupq_n_f32(val)} {} +<<<<<<< HEAD Vectorized(float val0, float val1, float val2, float val3) : values{val0, val1, val2, val3} {} +======= + Vectorized(float val0, float val1, float val2, float val3) + : values{val0, val1, val2, val3} {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) Vectorized(float (&arr)[4]) : Vectorized(arr[0], arr[1], arr[2], arr[3]) {} operator float32x4_t() const { return values; } template +<<<<<<< HEAD static Vectorized blend(const Vectorized& a, const Vectorized& b) { Vectorized vec; vec.values = @@ -100,12 +144,33 @@ template <> class Vectorized { } static Vectorized blendv(const Vectorized& a, const Vectorized& b, const Vectorized& mask) { +======= + static Vectorized blend( + const Vectorized& a, + const Vectorized& b) { + Vectorized vec; + vec.values = BlendRegs < 0, + (mask & 0x01) != 0 > ::impl(a.values, b.values, vec.values); + vec.values = BlendRegs < 1, + (mask & 0x02) != 0 > ::impl(a.values, b.values, vec.values); + vec.values = BlendRegs < 2, + (mask & 0x04) != 0 > ::impl(a.values, b.values, vec.values); + vec.values = BlendRegs < 3, + (mask & 0x08) != 0 > ::impl(a.values, b.values, vec.values); + return vec; + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // TODO // NB: This requires that each value, i.e., each uint value, // of the mask either all be zeros or all be 1s. // We perhaps need some kind of an assert? // But that will affect performance. Vectorized vec(mask.values); +<<<<<<< HEAD vec.values = vbslq_f32( vreinterpretq_u32_f32(vec.values), b.values, @@ -114,11 +179,22 @@ template <> class Vectorized { } template static Vectorized arange(float base = 0.f, step_t step = static_cast(1)) { +======= + vec.values = + vbslq_f32(vreinterpretq_u32_f32(vec.values), b.values, a.values); + return vec; + } + template + static Vectorized arange( + float base = 0.f, + step_t step = static_cast(1)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Vectorized base_vec(base); const Vectorized step_vec(step); const Vectorized step_sizes(0, 1, 2, 3); return fmadd(step_sizes, step_vec, base_vec); } +<<<<<<< HEAD static Vectorized set(const Vectorized& a, const Vectorized& b, int64_t count = size()) { switch (count) { @@ -157,6 +233,39 @@ template <> class Vectorized { a.values); return vec; } +======= + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { + switch (count) { + case 0: + return a; + case 1: { + Vectorized vec; + static uint32x4_t mask_low = {0xFFFFFFFF, 0x0, 0x0, 0x0}; + vec.values = vreinterpretq_f32_u32(mask_low); + vec.values = + vbslq_f32(vreinterpretq_u32_f32(vec.values), b.values, a.values); + return vec; + } + case 2: { + Vectorized vec; + static uint32x4_t mask_low = {0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0}; + vec.values = vreinterpretq_f32_u32(mask_low); + vec.values = + vbslq_f32(vreinterpretq_u32_f32(vec.values), b.values, a.values); + return vec; + } + case 3: { + Vectorized vec; + static uint32x4_t mask_low = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0}; + vec.values = vreinterpretq_f32_u32(mask_low); + vec.values = + vbslq_f32(vreinterpretq_u32_f32(vec.values), b.values, a.values); + return vec; + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } return b; } @@ -204,7 +313,11 @@ template <> class Vectorized { __at_align__ float tmp[size()]; store(tmp); int mask = 0; +<<<<<<< HEAD for (int i = 0; i < size(); ++ i) { +======= + for (int i = 0; i < size(); ++i) { +>>>>>>> 5729657180 ([ROCm] Specialized binary 
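
Annotation (not part of the patch): blendv for Vectorized<float> above relies on the NEON bit-select instruction, with the stated precondition that every mask lane is either all ones or all zeros. A minimal sketch, assuming AArch64 NEON:

#include <arm_neon.h>

inline float32x4_t blendv_f32_sketch(float32x4_t a, float32x4_t b, float32x4_t mask) {
  // Where a mask bit is set, take the bit from b; otherwise from a.
  return vbslq_f32(vreinterpretq_u32_f32(mask), b, a);
}
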
elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (tmp[i] == 0.f) { mask |= (1 << i); } @@ -218,7 +331,11 @@ template <> class Vectorized { __at_align__ float tmp[size()]; store(tmp); for (const auto i : c10::irange(size())) { +<<<<<<< HEAD if(_isnan(tmp[i]) || _isinf(tmp[i])) { +======= + if (_isnan(tmp[i]) || _isinf(tmp[i])) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return true; } } @@ -262,6 +379,7 @@ template <> class Vectorized { Vectorized conj() const { return *this; } +<<<<<<< HEAD #define DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(name, sleef_name) \ Vectorized name() const { \ return USE_SLEEF( \ @@ -272,6 +390,17 @@ template <> class Vectorized { #define DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(name) \ DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(name, Sleef_##name##f4_u10) +======= +#define DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( \ + name, sleef_name) \ + Vectorized name() const { \ + return USE_SLEEF(Vectorized(sleef_name(values)), map(std::name)); \ + } + +#define DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(name) \ + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( \ + name, Sleef_##name##f4_u10) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(acos) DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(acosh) @@ -280,6 +409,7 @@ template <> class Vectorized { DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(atan) DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(atanh) +<<<<<<< HEAD #define DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(name, sleef_name) \ Vectorized name(const Vectorized &arg) const { \ return USE_SLEEF( \ @@ -295,6 +425,28 @@ template <> class Vectorized { DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(copysign, Sleef_copysignf4) Vectorized erf() const; DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(erfc, Sleef_erfcf4_u15) +======= +#define DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( \ + name, sleef_name) \ + Vectorized name(const Vectorized& arg) const { \ + return USE_SLEEF( \ + Vectorized(sleef_name(values, arg.values)), \ + map2(arg, std::name)); \ + } + +#define DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC(name) \ + DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( \ + name, Sleef_##name##f4_u10) + + DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC(atan2) + DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( + copysign, + Sleef_copysignf4) + Vectorized erf() const; + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( + erfc, + Sleef_erfcf4_u15) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized erfinv() const { return map(calc_erfinv); } @@ -304,8 +456,17 @@ template <> class Vectorized { Vectorized exp_u20() const { return exp(); } +<<<<<<< HEAD DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(fmod, Sleef_fmodf4) DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(hypot, Sleef_hypotf4_u05) +======= + DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( + fmod, + Sleef_fmodf4) + DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( + hypot, + Sleef_hypotf4_u05) +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized i0() const { return map(calc_i0); } @@ -315,17 +476,30 @@ template <> class Vectorized { Vectorized digamma() const { return map(calc_digamma); } +<<<<<<< HEAD Vectorized igamma(const Vectorized &x) const { return map2(x, calc_igamma); } Vectorized igammac(const Vectorized &x) const { +======= + Vectorized igamma(const Vectorized& x) const { + return map2(x, calc_igamma); + } + Vectorized igammac(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map2(x, calc_igammac); } DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(log) DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(log10) DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(log1p) DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(log2) +<<<<<<< HEAD DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(nextafter, Sleef_nextafterf4) +======= + DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( + nextafter, + Sleef_nextafterf4) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized frac() const; DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(sin) DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(sinh) @@ -338,11 +512,19 @@ template <> class Vectorized { return map(at::native::floor_impl); } Vectorized neg() const { +<<<<<<< HEAD return Vectorized( vnegq_f32(values)); } Vectorized round() const { // We do not use std::round because we would like to round midway numbers to the nearest even integer. +======= + return Vectorized(vnegq_f32(values)); + } + Vectorized round() const { + // We do not use std::round because we would like to round midway numbers to + // the nearest even integer. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(at::native::round_impl); } DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(tan) @@ -362,16 +544,27 @@ template <> class Vectorized { } DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC(pow) Vectorized operator==(const Vectorized& other) const { +<<<<<<< HEAD return Vectorized(vreinterpretq_f32_u32(vceqq_f32(values, other.values))); } Vectorized operator!=(const Vectorized& other) const { float32x4_t r0 = vreinterpretq_f32_u32( vmvnq_u32(vceqq_f32(values, other.values))); +======= + return Vectorized( + vreinterpretq_f32_u32(vceqq_f32(values, other.values))); + } + + Vectorized operator!=(const Vectorized& other) const { + float32x4_t r0 = + vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(values, other.values))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(r0); } Vectorized operator<(const Vectorized& other) const { +<<<<<<< HEAD return Vectorized(vreinterpretq_f32_u32(vcltq_f32(values, other.values))); } @@ -385,6 +578,25 @@ template <> class Vectorized { Vectorized operator>=(const Vectorized& other) const { return Vectorized(vreinterpretq_f32_u32(vcgeq_f32(values, other.values))); +======= + return Vectorized( + vreinterpretq_f32_u32(vcltq_f32(values, other.values))); + } + + Vectorized operator<=(const Vectorized& other) const { + return Vectorized( + vreinterpretq_f32_u32(vcleq_f32(values, other.values))); + } + + Vectorized operator>(const Vectorized& other) const { + return Vectorized( + vreinterpretq_f32_u32(vcgtq_f32(values, other.values))); + } + + Vectorized operator>=(const Vectorized& other) const { + return Vectorized( + vreinterpretq_f32_u32(vcgeq_f32(values, other.values))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized eq(const Vectorized& other) const; @@ -396,22 +608,46 @@ template <> class Vectorized { }; template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(vaddq_f32(a, b)); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(vsubq_f32(a, b)); } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(vmulq_f32(a, b)); } template <> +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(vdivq_f32(a, b)); } @@ -420,6 +656,7 @@ inline Vectorized Vectorized::frac() const { return 
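
Annotation (not part of the patch): the round() comment above explains why std::round is avoided: at::native::round_impl is expected to round halfway cases to the nearest even integer, while std::round rounds them away from zero. A small demonstration of the difference using std::nearbyint under the default FE_TONEAREST mode, which also rounds ties to even:

#include <cmath>
#include <cstdio>

int main() {
  std::printf("%g %g\n", std::nearbyint(0.5), std::round(0.5));   // 0 1
  std::printf("%g %g\n", std::nearbyint(2.5), std::round(2.5));   // 2 3
  std::printf("%g %g\n", std::nearbyint(-1.5), std::round(-1.5)); // -2 -2
  return 0;
}
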
*this - this->trunc(); } +<<<<<<< HEAD //Added sleef Implementation for Maximum Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { if(!a.has_inf_nan() && !b.has_inf_nan()){ @@ -430,31 +667,64 @@ Vectorized inline maximum(const Vectorized& a, const Vectorized(vmaxq_f32(a, b)); } +======= +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return Vectorized(vmaxq_f32(a, b)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // Implements the IEEE 754 201X `minimum` operation, which propagates NaN if // either input is a NaN. template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(vminq_f32(a, b)); } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return minimum(max, maximum(min, a)); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return minimum(max, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return maximum(min, a); } template <> +<<<<<<< HEAD Vectorized inline operator&(const Vectorized& a, const Vectorized& b) { return Vectorized(vreinterpretq_f32_u32(vandq_u32( vreinterpretq_u32_f32(a), @@ -496,6 +766,58 @@ inline Vectorized Vectorized::lt(const Vectorized& other) c } inline Vectorized Vectorized::le(const Vectorized& other) const { +======= +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return Vectorized(vreinterpretq_f32_u32( + vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)))); +} + +template <> +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return Vectorized(vreinterpretq_f32_u32( + vorrq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)))); +} + +template <> +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return Vectorized(vreinterpretq_f32_u32( + veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)))); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::lt( + 
const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*this <= other) & Vectorized(1.0f); } @@ -505,7 +827,12 @@ inline void convert(const float* src, int32_t* dst, int64_t n) { #ifndef __msvc_cl__ #pragma unroll #endif +<<<<<<< HEAD for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { +======= + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) vst1q_s32(dst + i, vcvtq_s32_f32(vld1q_f32(src + i))); } #ifndef __msvc_cl__ @@ -522,7 +849,12 @@ inline void convert(const int32_t* src, float* dst, int64_t n) { #ifndef __msvc_cl__ #pragma unroll #endif +<<<<<<< HEAD for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { +======= + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) vst1q_f32(dst + i, vcvtq_f32_s32(vld1q_s32(src + i))); } #ifndef __msvc_cl__ @@ -534,11 +866,19 @@ inline void convert(const int32_t* src, float* dst, int64_t n) { } template <> +<<<<<<< HEAD Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(vfmaq_f32(c, a, b)); } template <> +<<<<<<< HEAD Vectorized inline fmsub(const Vectorized& a, const Vectorized& b, const Vectorized& c) { return Vectorized(vnegq_f32(vfmsq_f32(c, a, b))); } @@ -573,9 +913,54 @@ inline Vectorized Vectorized::erf() const{ auto tmp6 = t * tmp5; auto tmp7 = fmadd(tmp6, r, one_vec); return tmp7 ^ sign_mask; +======= +Vectorized inline fmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return Vectorized(vnegq_f32(vfmsq_f32(c, a, b))); +} + +inline Vectorized Vectorized::erf() const { + // constants + const Vectorized neg_zero_vec(-0.f); + const Vectorized one_vec(1.0f); + const Vectorized p(0.3275911f); + const Vectorized p1(0.254829592f); + const Vectorized p2(-0.284496736f); + const Vectorized p3(1.421413741f); + const Vectorized p4(-1.453152027f); + const Vectorized p5(1.061405429f); + // sign(x) + auto sign_mask = neg_zero_vec & *this; + auto abs_vec = this->abs(); + // t = 1 / (p * abs(x) + 1) + auto tmp0 = fmadd(p, abs_vec, one_vec); + auto t = one_vec / tmp0; + // r = p5 * t ^ 4 + p4 * t ^ 3 + p3 * t ^ 2 + p2 * t + p1 + auto tmp1 = fmadd(p5, t, p4); + auto tmp2 = fmadd(tmp1, t, p3); + auto tmp3 = fmadd(tmp2, t, p2); + auto r = fmadd(tmp3, t, p1); + // - exp(- x * x) + auto pow_2 = (*this) * (*this); + auto neg_pow_2 = pow_2 ^ neg_zero_vec; + auto tmp4 = neg_pow_2.map( + std::exp); // This can be swapped for a faster implementation of exp. 
+ auto tmp5 = tmp4 ^ neg_zero_vec; + // erf(x) = sign(x) * (1 - r * t * exp(- x * x)) + auto tmp6 = t * tmp5; + auto tmp7 = fmadd(tmp6, r, one_vec); + return tmp7 ^ sign_mask; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #undef DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC #undef DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC #endif /* defined(aarch64) */ +<<<<<<< HEAD }} // namespace at::vec::CPU_CAPABILITY +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_half_neon.h b/aten/src/ATen/cpu/vec/vec128/vec128_half_neon.h index e75e9d67655c..eef48a1a8fd6 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128_half_neon.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128_half_neon.h @@ -58,12 +58,33 @@ struct BlendHalfRegs { } }; +<<<<<<< HEAD // On ARM, Half type supports float16_t->Half constructor and Half->float16_t // conversion template <> class Vectorized : public Vectorized16> { using Base = Vectorized16>; friend Base; +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +// On ARM, Half type supports float16_t->Half constructor and Half->float16_t +// conversion +template <> +class Vectorized : public Vectorized16< + float16x8_t, + c10::Half, + BlendHalfRegs, + Vectorized> { + using Base = Vectorized16< + float16x8_t, + c10::Half, + BlendHalfRegs, + Vectorized>; + friend Base; + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) private: // We use these private map functions to implement various methods Vectorized map_with_vec_float_method( @@ -85,8 +106,15 @@ class Vectorized : public Vectorized16 mv0 = (Vectorized(v00).*m)(Vectorized(second_v00)); Vectorized mv1 = (Vectorized(v01).*m)(Vectorized(second_v01)); +======= + Vectorized mv0 = + (Vectorized(v00).*m)(Vectorized(second_v00)); + Vectorized mv1 = + (Vectorized(v01).*m)(Vectorized(second_v01)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) float16x4_t r00 = vcvt_f16_f32(mv0); float16x4_t r01 = vcvt_f16_f32(mv1); @@ -102,12 +130,26 @@ class Vectorized : public Vectorized16 mv0 = (Vectorized(v00).*m)(Vectorized(second_v00)); Vectorized mv1 = (Vectorized(v01).*m)(Vectorized(second_v01)); // Assume the operator returns a bitmask, not "real" floats, and // just narrow the bits. All-ones is a NaN and will get mangled by conversion! float16x4_t r00 = vreinterpret_f16_u16(vmovn_u32(vreinterpretq_u32_f32(mv0))); float16x4_t r01 = vreinterpret_f16_u16(vmovn_u32(vreinterpretq_u32_f32(mv1))); +======= + Vectorized mv0 = + (Vectorized(v00).*m)(Vectorized(second_v00)); + Vectorized mv1 = + (Vectorized(v01).*m)(Vectorized(second_v01)); + // Assume the operator returns a bitmask, not "real" floats, and + // just narrow the bits. All-ones is a NaN and will get mangled by + // conversion! 
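
Annotation (not part of the patch): Vectorized<float>::erf() above evaluates the Abramowitz and Stegun style rational polynomial approximation (the constants 0.3275911, 0.254829592, ... are the usual ones, commonly quoted with absolute error around 1.5e-7). A scalar sketch of the same formula, kept in the same evaluation order as the fmadd chain above:

#include <cmath>

inline float erf_approx_sketch(float x) {
  const float p  = 0.3275911f;
  const float p1 = 0.254829592f, p2 = -0.284496736f, p3 = 1.421413741f;
  const float p4 = -1.453152027f, p5 = 1.061405429f;
  const float sign = std::signbit(x) ? -1.0f : 1.0f;
  const float ax = std::fabs(x);
  const float t = 1.0f / (1.0f + p * ax);                        // t = 1 / (1 + p*|x|)
  const float r = p1 + t * (p2 + t * (p3 + t * (p4 + t * p5)));  // Horner form of the polynomial
  return sign * (1.0f - r * t * std::exp(-ax * ax));             // erf(x) ~ sign(x)*(1 - r*t*e^(-x^2))
}
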
+ float16x4_t r00 = + vreinterpret_f16_u16(vmovn_u32(vreinterpretq_u32_f32(mv0))); + float16x4_t r01 = + vreinterpret_f16_u16(vmovn_u32(vreinterpretq_u32_f32(mv1))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Pack result into Vectorized return Vectorized(vcombine_f16(r00, r01)); @@ -120,10 +162,15 @@ class Vectorized : public Vectorized16>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized( value_type val0, value_type val1, @@ -133,6 +180,7 @@ class Vectorized : public Vectorized16 : public Vectorized16>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static Vectorized blendv( const Vectorized& a, @@ -162,11 +214,18 @@ class Vectorized : public Vectorized16 vec(mask.values); +<<<<<<< HEAD vec.values = vreinterpretq_f16_u16( vbslq_u16( vreinterpretq_u16_f16(vec.values), vreinterpretq_u16_f16(b.values), vreinterpretq_u16_f16(a.values))); +======= + vec.values = vreinterpretq_f16_u16(vbslq_u16( + vreinterpretq_u16_f16(vec.values), + vreinterpretq_u16_f16(b.values), + vreinterpretq_u16_f16(a.values))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return vec; } static Vectorized set( @@ -181,12 +240,19 @@ class Vectorized : public Vectorized16 vec( vreinterpretq_f16_u16( vbslq_u16( mask, vreinterpretq_u16_f16(b.values), vreinterpretq_u16_f16(a.values)))); +======= + Vectorized vec(vreinterpretq_f16_u16(vbslq_u16( + mask, + vreinterpretq_u16_f16(b.values), + vreinterpretq_u16_f16(a.values)))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return vec; } @@ -284,50 +350,98 @@ class Vectorized : public Vectorized16 operator==(const Vectorized& other) const { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +<<<<<<< HEAD return Vectorized(vreinterpretq_f16_u16(vceqq_f16(values, other.values))); #else return map2_bitmask_with_vec_float_method(other, &Vectorized::operator==); +======= + return Vectorized( + vreinterpretq_f16_u16(vceqq_f16(values, other.values))); +#else + return map2_bitmask_with_vec_float_method( + other, &Vectorized::operator==); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif } Vectorized operator!=(const Vectorized& other) const { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +<<<<<<< HEAD return Vectorized(vreinterpretq_f16_u16( vmvnq_u16(vceqq_f16(values, other.values)))); #else return map2_bitmask_with_vec_float_method(other, &Vectorized::operator!=); +======= + return Vectorized( + vreinterpretq_f16_u16(vmvnq_u16(vceqq_f16(values, other.values)))); +#else + return map2_bitmask_with_vec_float_method( + other, &Vectorized::operator!=); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif } Vectorized operator<(const Vectorized& other) const { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +<<<<<<< HEAD return Vectorized(vreinterpretq_f16_u16(vcltq_f16(values, other.values))); #else return map2_bitmask_with_vec_float_method(other, &Vectorized::operator<); +======= + return Vectorized( + vreinterpretq_f16_u16(vcltq_f16(values, other.values))); +#else + return map2_bitmask_with_vec_float_method( + other, 
&Vectorized::operator<); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif } Vectorized operator<=(const Vectorized& other) const { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +<<<<<<< HEAD return Vectorized(vreinterpretq_f16_u16(vcleq_f16(values, other.values))); #else return map2_bitmask_with_vec_float_method(other, &Vectorized::operator<=); +======= + return Vectorized( + vreinterpretq_f16_u16(vcleq_f16(values, other.values))); +#else + return map2_bitmask_with_vec_float_method( + other, &Vectorized::operator<=); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif } Vectorized operator>(const Vectorized& other) const { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +<<<<<<< HEAD return Vectorized(vreinterpretq_f16_u16(vcgtq_f16(values, other.values))); #else return map2_bitmask_with_vec_float_method(other, &Vectorized::operator>); +======= + return Vectorized( + vreinterpretq_f16_u16(vcgtq_f16(values, other.values))); +#else + return map2_bitmask_with_vec_float_method( + other, &Vectorized::operator>); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif } Vectorized operator>=(const Vectorized& other) const { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +<<<<<<< HEAD return Vectorized(vreinterpretq_f16_u16(vcgeq_f16(values, other.values))); #else return map2_bitmask_with_vec_float_method(other, &Vectorized::operator>=); +======= + return Vectorized( + vreinterpretq_f16_u16(vcgeq_f16(values, other.values))); +#else + return map2_bitmask_with_vec_float_method( + other, &Vectorized::operator>=); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif } @@ -339,14 +453,27 @@ class Vectorized : public Vectorized16 le(const Vectorized& other) const; }; // Vectorized +<<<<<<< HEAD inline std::tuple, Vectorized> convert_half_float(const Vectorized& a) { +======= +inline std::tuple, Vectorized> convert_half_float( + const Vectorized& a) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static_assert(Vectorized::size() == 2 * Vectorized::size()); float16x8_t x = a; float32x4_t x1 = vcvt_f32_f16(vget_low_f16(x)); float32x4_t x2 = vcvt_f32_f16(vget_high_f16(x)); +<<<<<<< HEAD return { Vectorized(x1), Vectorized(x2) }; } inline Vectorized convert_float_half(const Vectorized& a, const Vectorized& b) { +======= + return {Vectorized(x1), Vectorized(x2)}; +} +inline Vectorized convert_float_half( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static_assert(Vectorized::size() == 2 * Vectorized::size()); float32x4_t x = a; float32x4_t y = b; @@ -363,8 +490,12 @@ Vectorized binary_operator_via_float( const auto [a_float_low, a_float_high] = convert_half_float(a); const auto [b_float_low, b_float_high] = convert_half_float(b); return convert_float_half( +<<<<<<< HEAD op(a_float_low, b_float_low), op(a_float_high, b_float_high)); +======= + op(a_float_low, b_float_low), op(a_float_high, b_float_high)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> @@ -426,7 +557,12 @@ Vectorized inline maximum( 
return Vectorized(vmaxq_f16(a, b)); #else return binary_operator_via_float( +<<<<<<< HEAD static_cast(*)(const Vectorized&, const Vectorized&)>(&maximum), +======= + static_cast (*)( + const Vectorized&, const Vectorized&)>(&maximum), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) a, b); #endif @@ -442,7 +578,12 @@ Vectorized inline minimum( return Vectorized(vminq_f16(a, b)); #else return binary_operator_via_float( +<<<<<<< HEAD static_cast(*)(const Vectorized&, const Vectorized&)>(&minimum), +======= + static_cast (*)( + const Vectorized&, const Vectorized&)>(&minimum), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) a, b); #endif @@ -474,24 +615,39 @@ template <> Vectorized inline operator&( const Vectorized& a, const Vectorized& b) { +<<<<<<< HEAD return Vectorized(vreinterpretq_f16_u16(vandq_u16( vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)))); +======= + return Vectorized(vreinterpretq_f16_u16( + vandq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> Vectorized inline operator|( const Vectorized& a, const Vectorized& b) { +<<<<<<< HEAD return Vectorized(vreinterpretq_f16_u16(vorrq_u16( vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)))); +======= + return Vectorized(vreinterpretq_f16_u16( + vorrq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> Vectorized inline operator^( const Vectorized& a, const Vectorized& b) { +<<<<<<< HEAD return Vectorized(vreinterpretq_f16_u16(veorq_u16( vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)))); +======= + return Vectorized(vreinterpretq_f16_u16( + veorq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline Vectorized Vectorized::eq( diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h b/aten/src/ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h index fec580eef4d6..6a2b88a47cab 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h @@ -10,10 +10,22 @@ inline namespace CPU_CAPABILITY { // Shared implementation between Vectorized and // Vectorized. Uses CRTP to allow derived class // customization. 
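The maximum/minimum fallbacks above, like binary_operator_via_float earlier in this file, all share one widen-compute-narrow shape when native fp16 arithmetic is unavailable. A minimal standalone sketch of that pattern, assuming AArch64 NEON and written independently of the PyTorch templates (the helper names are mine):

#include <arm_neon.h>

// Split the 8 half-precision lanes into two float32x4 halves, run the float
// op on each, and convert back. Op is any callable taking and returning
// float32x4_t.
template <typename Op>
float16x8_t binary_via_float_sketch(float16x8_t a, float16x8_t b, Op op) {
  float32x4_t a_lo = vcvt_f32_f16(vget_low_f16(a));
  float32x4_t a_hi = vcvt_f32_f16(vget_high_f16(a));
  float32x4_t b_lo = vcvt_f32_f16(vget_low_f16(b));
  float32x4_t b_hi = vcvt_f32_f16(vget_high_f16(b));
  return vcombine_f16(vcvt_f16_f32(op(a_lo, b_lo)),
                      vcvt_f16_f32(op(a_hi, b_hi)));
}

// Example: elementwise maximum computed through float32.
inline float16x8_t max_via_float(float16x8_t a, float16x8_t b) {
  return binary_via_float_sketch(
      a, b, [](float32x4_t x, float32x4_t y) { return vmaxq_f32(x, y); });
}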
+<<<<<<< HEAD template typename BlendRegs, typename Derived> struct Vectorized16 { protected: VecT values; +======= +template < + typename VecT, + typename ValueT, + template typename BlendRegs, + typename Derived> +struct Vectorized16 { + protected: + VecT values; + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) public: using value_type = ValueT; using size_type = int; @@ -28,7 +40,12 @@ struct Vectorized16 { value_type (*const f)(value_type, value_type)) const { __at_align__ value_type tmp_first[size()]; __at_align__ value_type tmp_second[size()]; +<<<<<<< HEAD static_cast(this)->store(tmp_first); // store this to tmp_first +======= + static_cast(this)->store( + tmp_first); // store this to tmp_first +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) second.store(tmp_second); for (const auto i : c10::irange(size())) { tmp_first[i] = f(tmp_first[i], tmp_second[i]); @@ -47,6 +64,7 @@ struct Vectorized16 { template static Derived blend(const Derived& a, const Derived& b) { Derived vec; +<<<<<<< HEAD vec.values = BlendRegs<0, (mask & 0x01) != 0>::impl( a.values, b.values, vec.values); vec.values = BlendRegs<1, (mask & 0x02) != 0>::impl( @@ -64,6 +82,25 @@ struct Vectorized16 { a.values, b.values, vec.values); vec.values = BlendRegs<7, (mask & 0x80) != 0>::impl( a.values, b.values, vec.values); +======= + vec.values = BlendRegs < 0, + (mask & 0x01) != 0 > ::impl(a.values, b.values, vec.values); + vec.values = BlendRegs < 1, + (mask & 0x02) != 0 > ::impl(a.values, b.values, vec.values); + vec.values = BlendRegs < 2, + (mask & 0x04) != 0 > ::impl(a.values, b.values, vec.values); + vec.values = BlendRegs < 3, + (mask & 0x08) != 0 > ::impl(a.values, b.values, vec.values); + + vec.values = BlendRegs < 4, + (mask & 0x10) != 0 > ::impl(a.values, b.values, vec.values); + vec.values = BlendRegs < 5, + (mask & 0x20) != 0 > ::impl(a.values, b.values, vec.values); + vec.values = BlendRegs < 6, + (mask & 0x40) != 0 > ::impl(a.values, b.values, vec.values); + vec.values = BlendRegs < 7, + (mask & 0x80) != 0 > ::impl(a.values, b.values, vec.values); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return vec; } @@ -120,8 +157,17 @@ struct Vectorized16 { Derived angle() const { auto zero = Derived(0); auto pi = Derived(c10::pi); +<<<<<<< HEAD auto tmp = Derived::blendv(zero, pi, *static_cast(this) < zero); return Derived::blendv(tmp, *static_cast(this), static_cast(this)->isnan()); +======= + auto tmp = + Derived::blendv(zero, pi, *static_cast(this) < zero); + return Derived::blendv( + tmp, + *static_cast(this), + static_cast(this)->isnan()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Derived real() const { return *this; @@ -137,6 +183,7 @@ struct Vectorized16 { // converting to FP32, applying the math function, and then converting back to // FP16/BF16. 
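Before the per-function wrappers that follow, a note on the blend<mask> member defined earlier in this class: bit i of the compile-time mask picks lane i from b when set and from a when clear, and the BlendRegs<i, bit> specializations simply unroll that choice so it is resolved at compile time. A scalar reference of the selection rule (illustrative, not the actual implementation):

#include <array>
#include <cstddef>
#include <cstdint>

// Scalar reference for blend<mask>. The real code expands this loop into one
// BlendRegs<i, (mask & (1 << i)) != 0>::impl call per lane, so each per-lane
// choice is a compile-time constant rather than a runtime branch.
template <int64_t Mask, typename T, std::size_t N>
std::array<T, N> blend_ref(const std::array<T, N>& a,
                           const std::array<T, N>& b) {
  std::array<T, N> out{};
  for (std::size_t i = 0; i < N; ++i) {
    out[i] = ((Mask >> i) & 1) ? b[i] : a[i];
  }
  return out;
}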
Derived acos() const { +<<<<<<< HEAD return static_cast(this)->map_with_vec_float_method(&Vectorized::acos); } Derived acosh() const { @@ -180,12 +227,73 @@ struct Vectorized16 { } Derived exp_u20() const { return static_cast(this)->map_with_vec_float_method(&Vectorized::exp_u20); +======= + return static_cast(this)->map_with_vec_float_method( + &Vectorized::acos); + } + Derived acosh() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::acosh); + } + Derived asin() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::asin); + } + Derived asinh() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::asinh); + } + Derived atan() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::atan); + } + Derived atanh() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::atanh); + } + Derived atan2(const Derived& exp) const { + return static_cast(this)->map2_with_vec_float_method( + exp, &Vectorized::atan2); + } + Derived copysign(const Derived& sign) const { + return static_cast(this)->map2_with_vec_float_method( + sign, &Vectorized::copysign); + } + Derived erf() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::erf); + } + Derived erfc() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::erfc); + } + Derived erfinv() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::erfinv); + } + Derived exp() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::exp); + } + Derived exp2() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::exp2); + } + Derived expm1() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::expm1); + } + Derived exp_u20() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::exp_u20); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Derived fmod(const Derived& q) const { // This function is questionable with a conversion, so we use map2 return map2(q, std::fmod); } Derived hypot(const Derived& b) const { +<<<<<<< HEAD return static_cast(this)->map2_with_vec_float_method(b, &Vectorized::hypot); } Derived i0() const { @@ -214,12 +322,53 @@ struct Vectorized16 { } Derived log2() const { return static_cast(this)->map_with_vec_float_method(&Vectorized::log2); +======= + return static_cast(this)->map2_with_vec_float_method( + b, &Vectorized::hypot); + } + Derived i0() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::i0); + } + Derived i0e() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::i0e); + } + Derived digamma() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::digamma); + } + Derived igamma(const Derived& x) const { + return static_cast(this)->map2_with_vec_float_method( + x, &Vectorized::igamma); + } + Derived igammac(const Derived& x) const { + return static_cast(this)->map2_with_vec_float_method( + x, &Vectorized::igammac); + } + Derived log() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::log); + } + Derived log10() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::log10); + } + Derived log1p() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::log1p); + } + Derived log2() const { + return 
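The fmod wrapper above (and nextafter further down) deliberately skips the float32 detour and goes through map2, which spills both operands to scratch buffers and applies the scalar libm routine lane by lane. A hedged standalone sketch of that fallback; the container type and lane count here are illustrative, not the header's:

#include <array>
#include <cmath>
#include <cstddef>

// Apply a scalar binary function elementwise, the way the map2 fallback does.
// Slower than widening to float32, but it sidesteps the precision concern the
// comment above raises about doing these particular functions in a wider type.
template <typename T, std::size_t N, typename F>
std::array<T, N> map2_ref(const std::array<T, N>& a,
                          const std::array<T, N>& b,
                          F f) {
  std::array<T, N> out{};
  for (std::size_t i = 0; i < N; ++i) {
    out[i] = f(a[i], b[i]);
  }
  return out;
}

// Usage: map2_ref(x, y, [](float p, float q) { return std::fmod(p, q); });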
static_cast(this)->map_with_vec_float_method( + &Vectorized::log2); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Derived nextafter(const Derived& b) const { // This function does not make sense with conversion, so we use map2 return map2(b, std::nextafter); } Derived sin() const { +<<<<<<< HEAD return static_cast(this)->map_with_vec_float_method(&Vectorized::sin); } Derived sinh() const { @@ -230,6 +379,22 @@ struct Vectorized16 { } Derived cosh() const { return static_cast(this)->map_with_vec_float_method(&Vectorized::cosh); +======= + return static_cast(this)->map_with_vec_float_method( + &Vectorized::sin); + } + Derived sinh() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::sinh); + } + Derived cos() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::cos); + } + Derived cosh() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::cosh); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Derived ceil() const { // This function is questionable with a conversion, so we use map @@ -244,6 +409,7 @@ struct Vectorized16 { return map(at::native::round_impl); } Derived tan() const { +<<<<<<< HEAD return static_cast(this)->map_with_vec_float_method(&Vectorized::tan); } Derived tanh() const { @@ -251,16 +417,36 @@ struct Vectorized16 { } Derived lgamma() const { return static_cast(this)->map_with_vec_float_method(&Vectorized::lgamma); +======= + return static_cast(this)->map_with_vec_float_method( + &Vectorized::tan); + } + Derived tanh() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::tanh); + } + Derived lgamma() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::lgamma); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Derived rsqrt() const { return static_cast(this)->sqrt().reciprocal(); } Derived pow(const Derived& exp) const { +<<<<<<< HEAD return static_cast(this)->map2_with_vec_float_method(exp, &Vectorized::pow); } }; +======= + return static_cast(this)->map2_with_vec_float_method( + exp, &Vectorized::pow); + } +}; + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace CPU_CAPABILITY } // namespace at::vec diff --git a/aten/src/ATen/cpu/vec/vec256/missing_vld1_neon.h b/aten/src/ATen/cpu/vec/vec256/missing_vld1_neon.h index 5540c8bc782f..a5dfbcc24338 100644 --- a/aten/src/ATen/cpu/vec/vec256/missing_vld1_neon.h +++ b/aten/src/ATen/cpu/vec/vec256/missing_vld1_neon.h @@ -1,261 +1,459 @@ /* Workaround for missing vld1_*_x2 and vst1_*_x2 intrinsics in gcc-7. 
*/ __extension__ extern __inline uint8x8x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_u8_x2 (const uint8_t *__a) { uint8x8x2_t ret; asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_u8_x2(const uint8_t* __a) { + uint8x8x2_t ret; + asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline int8x8x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_s8_x2 (const int8_t *__a) { int8x8x2_t ret; asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_s8_x2(const int8_t* __a) { + int8x8x2_t ret; + asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline uint16x4x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_u16_x2 (const uint16_t *__a) { uint16x4x2_t ret; asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_u16_x2(const uint16_t* __a) { + uint16x4x2_t ret; + asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline int16x4x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_s16_x2 (const int16_t *__a) { int16x4x2_t ret; asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_s16_x2(const int16_t* __a) { + int16x4x2_t ret; + asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline uint32x2x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_u32_x2 (const uint32_t *__a) { uint32x2x2_t ret; asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_u32_x2(const uint32_t* __a) { + uint32x2x2_t ret; + asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline int32x2x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_s32_x2 (const int32_t *__a) { int32x2x2_t ret; asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_s32_x2(const int32_t* __a) { + int32x2x2_t ret; + asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern 
__inline uint64x1x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_u64_x2 (const uint64_t *__a) { uint64x1x2_t ret; asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_u64_x2(const uint64_t* __a) { + uint64x1x2_t ret; + asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline int64x1x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_s64_x2 (const int64_t *__a) { int64x1x2_t ret; __builtin_aarch64_simd_oi __o; asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_s64_x2(const int64_t* __a) { + int64x1x2_t ret; + __builtin_aarch64_simd_oi __o; + asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline float16x4x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_f16_x2 (const float16_t *__a) { float16x4x2_t ret; asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_f16_x2(const float16_t* __a) { + float16x4x2_t ret; + asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline float32x2x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_f32_x2 (const float32_t *__a) { float32x2x2_t ret; asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_f32_x2(const float32_t* __a) { + float32x2x2_t ret; + asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline float64x1x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_f64_x2 (const float64_t *__a) { float64x1x2_t ret; asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_f64_x2(const float64_t* __a) { + float64x1x2_t ret; + asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline poly8x8x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_p8_x2 (const poly8_t *__a) { poly8x8x2_t ret; asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_p8_x2(const poly8_t* __a) { + poly8x8x2_t ret; + asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline poly16x4x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_p16_x2 (const poly16_t *__a) { poly16x4x2_t ret; asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_p16_x2(const poly16_t* __a) { + poly16x4x2_t ret; + asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline poly64x1x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_p64_x2 (const poly64_t *__a) { poly64x1x2_t ret; asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_p64_x2(const poly64_t* __a) { + poly64x1x2_t ret; + asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline uint8x16x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_u8_x2 (const uint8_t *__a) { uint8x16x2_t ret; asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_u8_x2(const uint8_t* __a) { + uint8x16x2_t ret; + asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline int8x16x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_s8_x2 (const int8_t *__a) { int8x16x2_t ret; asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_s8_x2(const int8_t* __a) { + int8x16x2_t ret; + asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline uint16x8x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_u16_x2 (const uint16_t *__a) { uint16x8x2_t ret; asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_u16_x2(const uint16_t* __a) { + uint16x8x2_t ret; + asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline int16x8x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_s16_x2 (const int16_t *__a) { int16x8x2_t ret; asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_s16_x2(const int16_t* __a) { + int16x8x2_t ret; + asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline uint32x4x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_u32_x2 (const uint32_t *__a) { uint32x4x2_t ret; asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_u32_x2(const uint32_t* __a) { + uint32x4x2_t ret; + asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline int32x4x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_s32_x2 (const int32_t *__a) { int32x4x2_t ret; asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_s32_x2(const int32_t* __a) { + int32x4x2_t ret; + asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline uint64x2x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_u64_x2 (const uint64_t *__a) { uint64x2x2_t ret; asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_u64_x2(const uint64_t* __a) { + uint64x2x2_t ret; + asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline int64x2x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_s64_x2 (const int64_t *__a) { int64x2x2_t ret; asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_s64_x2(const int64_t* __a) { + int64x2x2_t ret; + asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline float16x8x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_f16_x2 (const float16_t *__a) { float16x8x2_t ret; asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_f16_x2(const float16_t* __a) { + float16x8x2_t ret; + asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline float32x4x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_f32_x2 (const float32_t *__a) { float32x4x2_t ret; asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_f32_x2(const float32_t* __a) { + float32x4x2_t ret; + asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed 
dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline float64x2x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_f64_x2 (const float64_t *__a) { float64x2x2_t ret; asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_f64_x2(const float64_t* __a) { + float64x2x2_t ret; + asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline poly8x16x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_p8_x2 (const poly8_t *__a) { poly8x16x2_t ret; asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_p8_x2(const poly8_t* __a) { + poly8x16x2_t ret; + asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline poly16x8x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_p16_x2 (const poly16_t *__a) { poly16x8x2_t ret; asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_p16_x2(const poly16_t* __a) { + poly16x8x2_t ret; + asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline poly64x2x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_p64_x2 (const poly64_t *__a) { poly64x2x2_t ret; asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_p64_x2(const poly64_t* __a) { + poly64x2x2_t ret; + asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } /* vst1x2 */ __extension__ extern __inline void +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1_s64_x2 (int64_t * __a, int64x1x2_t val) { @@ -449,4 +647,171 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_p64_x2 (poly64_t * __a, poly64x2x2_t val) { asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q" (*__a) : "w" (val)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_s64_x2(int64_t* __a, int64x1x2_t val) { + asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_u64_x2(uint64_t* __a, uint64x1x2_t val) { + asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_f64_x2(float64_t* __a, float64x1x2_t val) { + asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, 
__gnu_inline__, __artificial__)) + vst1_s8_x2(int8_t* __a, int8x8x2_t val) { + asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_p8_x2(poly8_t* __a, poly8x8x2_t val) { + asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_s16_x2(int16_t* __a, int16x4x2_t val) { + asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_p16_x2(poly16_t* __a, poly16x4x2_t val) { + asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_s32_x2(int32_t* __a, int32x2x2_t val) { + asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_u8_x2(uint8_t* __a, uint8x8x2_t val) { + asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_u16_x2(uint16_t* __a, uint16x4x2_t val) { + asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_u32_x2(uint32_t* __a, uint32x2x2_t val) { + asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_f16_x2(float16_t* __a, float16x4x2_t val) { + asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_f32_x2(float32_t* __a, float32x2x2_t val) { + asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_p64_x2(poly64_t* __a, poly64x1x2_t val) { + asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_s8_x2(int8_t* __a, int8x16x2_t val) { + asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_p8_x2(poly8_t* __a, poly8x16x2_t val) { + asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_s16_x2(int16_t* __a, int16x8x2_t val) { + asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_p16_x2(poly16_t* __a, poly16x8x2_t val) { + asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_s32_x2(int32_t* __a, int32x4x2_t val) { + asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : 
"w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_s64_x2(int64_t* __a, int64x2x2_t val) { + asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_u8_x2(uint8_t* __a, uint8x16x2_t val) { + asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_u16_x2(uint16_t* __a, uint16x8x2_t val) { + asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_u32_x2(uint32_t* __a, uint32x4x2_t val) { + asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_u64_x2(uint64_t* __a, uint64x2x2_t val) { + asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_f16_x2(float16_t* __a, float16x8x2_t val) { + asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_f32_x2(float32_t* __a, float32x4x2_t val) { + asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_f64_x2(float64_t* __a, float64x2x2_t val) { + asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_p64_x2(poly64_t* __a, poly64x2x2_t val) { + asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } diff --git a/aten/src/ATen/cpu/vec/vec256/missing_vst1_neon.h b/aten/src/ATen/cpu/vec/vec256/missing_vst1_neon.h index 711d16f9b231..9a327d11f1f6 100644 --- a/aten/src/ATen/cpu/vec/vec256/missing_vst1_neon.h +++ b/aten/src/ATen/cpu/vec/vec256/missing_vst1_neon.h @@ -1,8 +1,14 @@ /* Workaround for missing vst1q_f32_x2 in gcc-8. 
*/ __extension__ extern __inline void +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_f32_x2 (float32_t * __a, float32x4x2_t val) { asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q" (*__a) : "w" (val)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_f32_x2(float32_t* __a, float32x4x2_t val) { + asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h index 83bb70bdbcbf..42847bcd0c51 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256.h @@ -6,6 +6,7 @@ #include #include +<<<<<<< HEAD #if !(defined(__VSX__) || defined(CPU_CAPABILITY_VSX) || defined(CPU_CAPABILITY_ZVECTOR)) #if defined(CPU_CAPABILITY_SVE256) #include @@ -24,6 +25,35 @@ #include #include #include +======= +#if !( \ + defined(__VSX__) || defined(CPU_CAPABILITY_VSX) || \ + defined(CPU_CAPABILITY_ZVECTOR)) +#if defined(CPU_CAPABILITY_SVE256) +#include +#else +// clang-format off +#include +#include +#include +#include +#endif +#if !defined(CPU_CAPABILITY_SVE256) || !defined(__ARM_FEATURE_BF16) +#include +#endif +#include +#include +#include +// clang-format on +#elif defined(__VSX__) || defined(CPU_CAPABILITY_VSX) +#include +#else +// clang-format off +#include +#include +#include +// clang-format on +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif #include @@ -75,34 +105,56 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { return stream; } +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if defined(CPU_CAPABILITY_AVX2) // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +<<<<<<< HEAD template<> +======= +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized cast(const Vectorized& src) { return _mm256_castpd_ps(src); } +<<<<<<< HEAD template<> +======= +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized cast(const Vectorized& src) { return _mm256_castps_pd(src); } +<<<<<<< HEAD template<> +======= +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized cast(const Vectorized& src) { return _mm256_castsi256_ps(src); } +<<<<<<< HEAD template<> inline Vectorized cast(const Vectorized& src) { +======= +template <> +inline Vectorized cast( + const Vectorized& src) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_castsi256_pd(src); } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #ifndef _MSC_VER // MSVC is not working well on complex function overload. 
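All of the inline-asm definitions in the two headers above exist only because gcc-7 and gcc-8 lack the vld1*_x2 / vst1*_x2 intrinsics; the asm forces a single ld1/st1 pair instruction. Functionally each one is just two ordinary loads or stores, as in this hedged sketch (my own helper names, not the headers' approach):

#include <arm_neon.h>

// Functional equivalents of the pair load/store intrinsics the inline asm
// provides. Two plain vld1q/vst1q calls give the same result; the asm versions
// exist to emit a single ld1/st1 {vA, vB} instruction on old compilers.
static inline float32x4x2_t load_f32_x2_fallback(const float32_t* p) {
  float32x4x2_t r;
  r.val[0] = vld1q_f32(p);
  r.val[1] = vld1q_f32(p + 4);
  return r;
}

static inline void store_f32_x2_fallback(float32_t* p, float32x4x2_t v) {
  vst1q_f32(p, v.val[0]);
  vst1q_f32(p + 4, v.val[1]);
}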
+<<<<<<< HEAD template std::enable_if_t> inline gather(const double* base_addr, const Vectorized& vindex) { @@ -112,12 +164,28 @@ inline gather(const double* base_addr, const Vectorized& vindex) { template std::enable_if_t> inline gather(const float* base_addr, const Vectorized& vindex) { +======= +template +std::enable_if_t< + scale == 1 || scale == 2 || scale == 4 || scale == 8, + Vectorized< + double>> inline gather(const double* base_addr, const Vectorized& vindex) { + return _mm256_i64gather_pd(base_addr, vindex, scale); +} + +template +std::enable_if_t< + scale == 1 || scale == 2 || scale == 4 || scale == 8, + Vectorized< + float>> inline gather(const float* base_addr, const Vectorized& vindex) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_i32gather_ps(base_addr, vindex, scale); } #endif // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #ifndef _MSC_VER // MSVC is not working well on complex function overload. +<<<<<<< HEAD template std::enable_if_t> inline mask_gather(const Vectorized& src, const double* base_addr, @@ -129,6 +197,25 @@ template std::enable_if_t> inline mask_gather(const Vectorized& src, const float* base_addr, const Vectorized& vindex, Vectorized& mask) { +======= +template +std:: + enable_if_t> inline mask_gather( + const Vectorized& src, + const double* base_addr, + const Vectorized& vindex, + Vectorized& mask) { + return _mm256_mask_i64gather_pd(src, base_addr, vindex, mask, scale); +} + +template +std:: + enable_if_t> inline mask_gather( + const Vectorized& src, + const float* base_addr, + const Vectorized& vindex, + Vectorized& mask) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale); } #endif @@ -136,6 +223,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, // Only works for inputs in the range: [-2^51, 2^51] // From: https://stackoverflow.com/a/41148578 +<<<<<<< HEAD template<> Vectorized inline convert_to_int_of_same_size(const Vectorized &src) { @@ -149,10 +237,25 @@ inline convert_to_int_of_same_size(const Vectorized &src) { template<> Vectorized inline convert_to_int_of_same_size(const Vectorized &src) { +======= +template <> +Vectorized inline convert_to_int_of_same_size( + const Vectorized& src) { + auto x = _mm256_add_pd(src, _mm256_set1_pd(0x0018000000000000)); + return _mm256_sub_epi64( + _mm256_castpd_si256(x), + _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000))); +} + +template <> +Vectorized inline convert_to_int_of_same_size( + const Vectorized& src) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_cvttps_epi32(src); } // From: https://stackoverflow.com/a/41148578 +<<<<<<< HEAD template<> Vectorized inline convert_to_fp_of_same_size(const Vectorized &src) { @@ -173,27 +276,69 @@ inline convert_to_fp_of_same_size(const Vectorized &src) { template<> Vectorized inline convert_to_fp_of_same_size(const Vectorized &src) { +======= +template <> +Vectorized inline convert_to_fp_of_same_size( + const Vectorized& src) { + __m256i magic_i_lo = _mm256_set1_epi64x(0x4330000000000000); /* 2^52 */ + __m256i magic_i_hi32 = + _mm256_set1_epi64x(0x4530000080000000); /* 2^84 + 2^63 */ + __m256i magic_i_all = + _mm256_set1_epi64x(0x4530000080100000); /* 2^84 + 2^63 + 2^52 */ + 
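convert_to_int_of_same_size<double> above relies on the magic constant 0x0018000000000000, whose value as a double is 2^52 + 2^51, and the int64-to-double path being assembled here splits the same idea across the 2^52 and 2^84 + 2^63 constants. A scalar sketch of the forward trick, valid only for |x| <= 2^51 as the original comment notes (helper name is mine):

#include <cstdint>
#include <cstring>

// Adding 2^52 + 2^51 (= 6755399441055744.0) pins the exponent so the rounded
// integer lands in the mantissa bits; subtracting the magic constant's bit
// pattern recovers it. This rounds to nearest rather than truncating.
int64_t double_to_int64_magic(double x) {
  const double magic = 6755399441055744.0;  // 2^52 + 2^51
  double shifted = x + magic;
  int64_t shifted_bits = 0;
  int64_t magic_bits = 0;
  std::memcpy(&shifted_bits, &shifted, sizeof(shifted_bits));
  std::memcpy(&magic_bits, &magic, sizeof(magic_bits));
  return shifted_bits - magic_bits;
}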
__m256d magic_d_all = _mm256_castsi256_pd(magic_i_all); + + __m256i v_lo = _mm256_blend_epi32( + magic_i_lo, src, 0b01010101); /* v_low = low32 + 2^52 */ + __m256i v_hi = _mm256_srli_epi64(src, 32); + v_hi = _mm256_xor_si256( + v_hi, magic_i_hi32); /* v_hi = high32*2^32 + 2^84 + 2^63 */ + /* int64 = low32 + high32*2^32 = v_hi + v_lo - 2^52 - 2^63 - 2^84 */ + __m256d v_hi_dbl = _mm256_sub_pd(_mm256_castsi256_pd(v_hi), magic_d_all); + __m256d result = _mm256_add_pd(v_hi_dbl, _mm256_castsi256_pd(v_lo)); + return result; +} + +template <> +Vectorized inline convert_to_fp_of_same_size( + const Vectorized& src) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_cvtepi32_ps(src); } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template <> +<<<<<<< HEAD std::pair, Vectorized> inline interleave2(const Vectorized& a, const Vectorized& b) { // inputs: // a = {a0, a1, a3, a3} +======= +std::pair, Vectorized> inline interleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, a1, a2, a3} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // b = {b0, b1, b2, b3} // swap lanes: // a_swapped = {a0, a1, b0, b1} // b_swapped = {a2, a3, b2, b3} +<<<<<<< HEAD auto a_swapped = _mm256_permute2f128_pd(a, b, 0b0100000); // 0, 2. 4 bits apart auto b_swapped = _mm256_permute2f128_pd(a, b, 0b0110001); // 1, 3. 4 bits apart +======= + auto a_swapped = + _mm256_permute2f128_pd(a, b, 0b0100000); // 0, 2. 4 bits apart + auto b_swapped = + _mm256_permute2f128_pd(a, b, 0b0110001); // 1, 3. 4 bits apart +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // group cols crossing lanes: // return {a0, b0, a1, b1} // {a2, b2, a3, b3} +<<<<<<< HEAD return std::make_pair(_mm256_permute4x64_pd(a_swapped, 0b11011000), // 0, 2, 1, 3 _mm256_permute4x64_pd(b_swapped, 0b11011000)); // 0, 2, 1, 3 } @@ -201,6 +346,17 @@ inline interleave2(const Vectorized& a, const Vectorized template <> std::pair, Vectorized> inline interleave2(const Vectorized& a, const Vectorized& b) { +======= + return std::make_pair( + _mm256_permute4x64_pd(a_swapped, 0b11011000), // 0, 2, 1, 3 + _mm256_permute4x64_pd(b_swapped, 0b11011000)); // 0, 2, 1, 3 +} + +template <> +std::pair, Vectorized> inline interleave2( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // inputs: // a = {a0, a1, a2, a3, a4, a5, a6, a7} // b = {b0, b1, b2, b3, b4, b5, b6, b7} @@ -209,22 +365,41 @@ inline interleave2(const Vectorized& a, const Vectorized& b // a_swapped = {a0, a1, a2, a3, b0, b1, b2, b3} // b_swapped = {a4, a5, a6, a7, b4, b5, b6, b7} // TODO: can we support caching this? +<<<<<<< HEAD auto a_swapped = _mm256_permute2f128_ps(a, b, 0b0100000); // 0, 2. 4 bits apart auto b_swapped = _mm256_permute2f128_ps(a, b, 0b0110001); // 1, 3. 4 bits apart +======= + auto a_swapped = + _mm256_permute2f128_ps(a, b, 0b0100000); // 0, 2. 4 bits apart + auto b_swapped = + _mm256_permute2f128_ps(a, b, 0b0110001); // 1, 3. 
4 bits apart +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // group cols crossing lanes: // return {a0, b0, a1, b1, a2, b2, a3, b3} // {a4, b4, a5, b5, a6, b6, a7, b7} const __m256i group_ctrl = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7); +<<<<<<< HEAD return std::make_pair(_mm256_permutevar8x32_ps(a_swapped, group_ctrl), _mm256_permutevar8x32_ps(b_swapped, group_ctrl)); +======= + return std::make_pair( + _mm256_permutevar8x32_ps(a_swapped, group_ctrl), + _mm256_permutevar8x32_ps(b_swapped, group_ctrl)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEINTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template <> +<<<<<<< HEAD std::pair, Vectorized> inline deinterleave2(const Vectorized& a, const Vectorized& b) { +======= +std::pair, Vectorized> inline deinterleave2( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // inputs: // a = {a0, b0, a1, b1} // b = {a2, b2, a3, b3} @@ -232,12 +407,18 @@ inline deinterleave2(const Vectorized& a, const Vectorized>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // swap lanes: // return {a0, a1, a2, a3} // {b0, b1, b2, b3} +<<<<<<< HEAD return std::make_pair(_mm256_permute2f128_pd(a_grouped, b_grouped, 0b0100000), // 0, 2. 4 bits apart _mm256_permute2f128_pd(a_grouped, b_grouped, 0b0110001)); // 1, 3. 4 bits apart } @@ -245,6 +426,19 @@ inline deinterleave2(const Vectorized& a, const Vectorized std::pair, Vectorized> inline deinterleave2(const Vectorized& a, const Vectorized& b) { +======= + return std::make_pair( + _mm256_permute2f128_pd( + a_grouped, b_grouped, 0b0100000), // 0, 2. 4 bits apart + _mm256_permute2f128_pd( + a_grouped, b_grouped, 0b0110001)); // 1, 3. 4 bits apart +} + +template <> +std::pair, Vectorized> inline deinterleave2( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // inputs: // a = {a0, b0, a1, b1, a2, b2, a3, b3} // b = {a4, b4, a5, b5, a6, b6, a7, b7} @@ -260,18 +454,32 @@ inline deinterleave2(const Vectorized& a, const Vectorized& // swap lanes: // return {a0, a1, a2, a3, a4, a5, a6, a7} // {b0, b1, b2, b3, b4, b5, b6, b7} +<<<<<<< HEAD return std::make_pair(_mm256_permute2f128_ps(a_grouped, b_grouped, 0b0100000), // 0, 2. 4 bits apart _mm256_permute2f128_ps(a_grouped, b_grouped, 0b0110001)); // 1, 3. 4 bits apart +======= + return std::make_pair( + _mm256_permute2f128_ps( + a_grouped, b_grouped, 0b0100000), // 0, 2. 4 bits apart + _mm256_permute2f128_ps( + a_grouped, b_grouped, 0b0110001)); // 1, 3. 
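The interleave2 specializations above zip two vectors into even/odd pairs using one in-lane swap plus one cross-lane permute per output, and the deinterleave2 specializations undo it. A scalar reference for the semantics spelled out in their comments (element count is illustrative):

#include <array>
#include <cstddef>
#include <utility>

// interleave2:   {a0..}, {b0..}          -> {a0,b0,a1,b1,...} split in two halves
// deinterleave2: {a0,b0,...}, {a4,b4,...} -> {a0..}, {b0..}
template <typename T, std::size_t N>
std::pair<std::array<T, N>, std::array<T, N>> interleave2_ref(
    const std::array<T, N>& a, const std::array<T, N>& b) {
  std::array<T, N> lo{}, hi{};
  for (std::size_t i = 0; i < N / 2; ++i) {
    lo[2 * i] = a[i];
    lo[2 * i + 1] = b[i];
    hi[2 * i] = a[N / 2 + i];
    hi[2 * i + 1] = b[N / 2 + i];
  }
  return {lo, hi};
}

template <typename T, std::size_t N>
std::pair<std::array<T, N>, std::array<T, N>> deinterleave2_ref(
    const std::array<T, N>& x, const std::array<T, N>& y) {
  std::array<T, N> a{}, b{};
  for (std::size_t i = 0; i < N / 2; ++i) {
    a[i] = x[2 * i];
    b[i] = x[2 * i + 1];
    a[N / 2 + i] = y[2 * i];
    b[N / 2 + i] = y[2 * i + 1];
  }
  return {a, b};
}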
4 bits apart +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FLIP ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +<<<<<<< HEAD template<> inline Vectorized flip(const Vectorized & v) { +======= +template <> +inline Vectorized flip(const Vectorized& v) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const __m256i mask_float = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); return _mm256_permutevar8x32_ps(v, mask_float); } +<<<<<<< HEAD template<> inline Vectorized flip(const Vectorized & v) { return _mm256_permute4x64_pd(v, 27); // 27 == _MM_SHUFFLE(0, 1, 2, 3) @@ -284,29 +492,119 @@ inline Vectorized flip(const Vectorized & v) { template<> inline Vectorized flip(const Vectorized & v) { +======= +template <> +inline Vectorized flip(const Vectorized& v) { + return _mm256_permute4x64_pd(v, 27); // 27 == _MM_SHUFFLE(0, 1, 2, 3) +} + +template <> +inline Vectorized flip(const Vectorized& v) { + return _mm256_permute4x64_epi64(v, 27); // 27 == _MM_SHUFFLE(0, 1, 2, 3) +} + +template <> +inline Vectorized flip(const Vectorized& v) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const __m256i mask_int32 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); return _mm256_permutevar8x32_epi32(v, mask_int32); } +<<<<<<< HEAD template<> inline Vectorized flip(const Vectorized & v) { const __m256i mask = _mm256_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 ); +======= +template <> +inline Vectorized flip(const Vectorized& v) { + const __m256i mask = _mm256_set_epi8( + 1, + 0, + 3, + 2, + 5, + 4, + 7, + 6, + 9, + 8, + 11, + 10, + 13, + 12, + 15, + 14, + 1, + 0, + 3, + 2, + 5, + 4, + 7, + 6, + 9, + 8, + 11, + 10, + 13, + 12, + 15, + 14); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto reversed = _mm256_shuffle_epi8(v, mask); return _mm256_permute2x128_si256(reversed, reversed, 1); } +<<<<<<< HEAD inline __m256i flip8(const __m256i & v) { const __m256i mask_int8 = _mm256_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ); +======= +inline __m256i flip8(const __m256i& v) { + const __m256i mask_int8 = _mm256_set_epi8( + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto reversed = _mm256_shuffle_epi8(v, mask_int8); return _mm256_permute2x128_si256(reversed, reversed, 1); } +<<<<<<< HEAD template<> inline Vectorized flip(const Vectorized & v) { return flip8(v); @@ -314,6 +612,15 @@ inline Vectorized flip(const Vectorized & v) { template<> inline Vectorized flip(const Vectorized & v) { +======= +template <> +inline Vectorized flip(const Vectorized& v) { + return flip8(v); +} + +template <> +inline Vectorized flip(const Vectorized& v) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return flip8(v); } @@ -330,4 +637,9 @@ inline Vectorized operator&&( #endif // (defined(CPU_CAPABILITY_AVX2) +<<<<<<< HEAD }} // namepsace 
at::vec::CPU_CAPABILITY +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_16bit_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_16bit_float.h index e661f69b40d7..45ec0ba11f4e 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_16bit_float.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_16bit_float.h @@ -3,12 +3,22 @@ // DO NOT DEFINE STATIC DATA IN THIS HEADER! // See Note [Do not compile initializers with AVX] +<<<<<<< HEAD // Used for shared functions and classes for vec256_bfloat16.h and vec256_half.h. // Any functions/classes that are common between those two files should be defined here. // Any non-shared functions/classes should be defined in the respective files. #include #include +======= +// Used for shared functions and classes for vec256_bfloat16.h and +// vec256_half.h. Any functions/classes that are common between those two files +// should be defined here. Any non-shared functions/classes should be defined in +// the respective files. + +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if defined(CPU_CAPABILITY_AVX2) #define SLEEF_STATIC_LIBS @@ -32,7 +42,10 @@ inline namespace CPU_CAPABILITY { #define SLEEF_CONST_OLD #endif +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // bfloat16 conversion static inline void cvtbf16_fp32(const __m128i& a, __m256& o) { o = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(a), 16)); @@ -61,7 +74,12 @@ static inline __m128i cvtfp32_bf16(const __m256& src) { t_value = _mm256_srli_epi32(t_value, 16); // Check NaN before converting back to bf16 t_value = _mm256_blendv_epi8(nan, t_value, mask); +<<<<<<< HEAD t_value = _mm256_packus_epi32(t_value, t_value); // t[4-7] t[4-7] t[0-4] t[0-4] +======= + t_value = + _mm256_packus_epi32(t_value, t_value); // t[4-7] t[4-7] t[0-4] t[0-4] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) t_value = _mm256_permute4x64_epi64(t_value, 0xd8); // 11 01 10 00 return _mm256_castsi256_si128(t_value); } @@ -90,8 +108,14 @@ static inline __m256i cvtfp32_bf16(const __m256& a, const __m256& b) { t_lo = _mm256_blendv_epi8(nan, t_lo, mask_lo); t_hi = _mm256_blendv_epi8(nan, t_hi, mask_hi); +<<<<<<< HEAD t_lo = _mm256_packus_epi32(t_lo, t_hi); // t_hi[4-7] t_lo[4-7] t_hi[0-4] t_lo[0-4] return _mm256_permute4x64_epi64(t_lo, 0xd8); // 11 01 10 00 +======= + t_lo = _mm256_packus_epi32( + t_lo, t_hi); // t_hi[4-7] t_lo[4-7] t_hi[0-4] t_lo[0-4] + return _mm256_permute4x64_epi64(t_lo, 0xd8); // 11 01 10 00 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } static inline __m256i merge_compare_result(const __m256& a, const __m256& b) { @@ -116,6 +140,7 @@ static inline void cvtfp16_fp32(const __m256i& a, __m256& o1, __m256& o2) { } static inline __m128i cvtfp32_fp16(const __m256& src) { +<<<<<<< HEAD return _mm256_cvtps_ph( src, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); } @@ -125,10 +150,21 @@ static inline __m256i cvtfp32_fp16(const __m256& a, const __m256& b) { a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); __m128i hi = _mm256_cvtps_ph( b, (_MM_FROUND_TO_NEAREST_INT | 
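The float-to-bfloat16 helper above rounds the 16 bits being dropped and patches NaN lanes through a blend before truncating to the high half. A scalar sketch of that strategy; the rounding bias is the standard ties-to-even trick, and the NaN payload used here is my own illustrative choice rather than the header's constant:

#include <cstdint>
#include <cstring>

// Round-to-nearest-even on the bits being discarded, keep the high 16 bits,
// and force NaN inputs to a fixed NaN pattern so truncation cannot turn a NaN
// with a low-half-only payload into an infinity.
uint16_t fp32_to_bf16_ref(float f) {
  uint32_t bits = 0;
  std::memcpy(&bits, &f, sizeof(bits));
  if (f != f) {
    return 0x7FC0;  // a canonical quiet NaN in bfloat16 (illustrative)
  }
  uint32_t lsb = (bits >> 16) & 1u;  // ties-to-even: look at the kept LSB
  bits += 0x7FFFu + lsb;             // rounding bias
  return static_cast<uint16_t>(bits >> 16);
}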
_MM_FROUND_NO_EXC)); +======= + return _mm256_cvtps_ph(src, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +} + +static inline __m256i cvtfp32_fp16(const __m256& a, const __m256& b) { + __m128i lo = + _mm256_cvtps_ph(a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + __m128i hi = + _mm256_cvtps_ph(b, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); } // dtype conversion between float16/bfloat16 and float32 +<<<<<<< HEAD template , int> = 0> inline void cvt_to_fp32(const __m128i& a, __m256& o); template <> inline void cvt_to_fp32(const __m128i& a, __m256& o) { @@ -160,17 +196,77 @@ template <> inline __m256i cvt_from_fp32(const __m256& a, const __m return cvtfp32_fp16(a, b); } template <> inline __m256i cvt_from_fp32(const __m256& a, const __m256& b) { +======= +template < + typename T, + typename std::enable_if_t, int> = 0> +inline void cvt_to_fp32(const __m128i& a, __m256& o); +template <> +inline void cvt_to_fp32(const __m128i& a, __m256& o) { + cvtbf16_fp32(a, o); +} +template <> +inline void cvt_to_fp32(const __m128i& a, __m256& o) { + cvtfp16_fp32(a, o); +} + +template < + typename T, + typename std::enable_if_t, int> = 0> +inline void cvt_to_fp32(const __m256i& a, __m256& o1, __m256& o2); +template <> +inline void cvt_to_fp32(const __m256i& a, __m256& o1, __m256& o2) { + cvtbf16_fp32(a, o1, o2); +} +template <> +inline void cvt_to_fp32(const __m256i& a, __m256& o1, __m256& o2) { + cvtfp16_fp32(a, o1, o2); +} + +template < + typename T, + bool is_compare_op = false, + typename std::enable_if_t, int> = 0> +inline __m256i cvt_from_fp32(const __m256& a, const __m256& b); +template <> +inline __m256i cvt_from_fp32( + const __m256& a, + const __m256& b) { + return cvtfp32_bf16(a, b); +} +template <> +inline __m256i cvt_from_fp32(const __m256& a, const __m256& b) { + return merge_compare_result(a, b); +} +template <> +inline __m256i cvt_from_fp32(const __m256& a, const __m256& b) { + return cvtfp32_fp16(a, b); +} +template <> +inline __m256i cvt_from_fp32(const __m256& a, const __m256& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return cvtfp32_fp16(a, b); } template class Vectorized16 { +<<<<<<< HEAD static_assert( is_reduced_floating_point_v, "Support only float16 and bfloat16."); protected: __m256i values; public: +======= + static_assert( + is_reduced_floating_point_v, + "Support only float16 and bfloat16."); + + protected: + __m256i values; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = uint16_t; using size_type = int; static constexpr size_type size() { @@ -182,6 +278,7 @@ static_assert( value_type uw = val.x; values = _mm256_set1_epi16(uw); } +<<<<<<< HEAD Vectorized16(T val1, T val2, T val3, T val4, T val5, T val6, T val7, T val8, T val9, T val10, T val11, T val12, @@ -189,14 +286,57 @@ static_assert( values = _mm256_setr_epi16( val1.x, val2.x, val3.x, val4.x, val5.x, val6.x, val7.x, val8.x, val9.x, val10.x, val11.x, val12.x, val13.x, val14.x, val15.x, val16.x); +======= + Vectorized16( + T val1, + T val2, + T val3, + T val4, + T val5, + T val6, + T val7, + T val8, + T val9, + T val10, + T val11, + T val12, + T val13, + T val14, + T val15, + T val16) { + values = _mm256_setr_epi16( + 
val1.x, + val2.x, + val3.x, + val4.x, + val5.x, + val6.x, + val7.x, + val8.x, + val9.x, + val10.x, + val11.x, + val12.x, + val13.x, + val14.x, + val15.x, + val16.x); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } operator __m256i() const { return values; } T& operator[](int idx) = delete; +<<<<<<< HEAD const T& operator[](int idx) const = delete; int zero_mask() const { // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit +======= + const T& operator[](int idx) const = delete; + int zero_mask() const { + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated to 0-bit +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256i cmp = _mm256_cmpeq_epi16(values, _mm256_set1_epi16(0)); return _mm256_movemask_epi8(cmp); } @@ -261,6 +401,7 @@ static_assert( tmp_values[15] = _mm256_extract_epi16(b.values, 15); return loadu(tmp_values); } +<<<<<<< HEAD static Vectorized blendv(const Vectorized& a, const Vectorized& b, const Vectorized& mask) { return _mm256_blendv_epi8(a.values, b.values, mask.values); @@ -275,6 +416,40 @@ static_assert( } static Vectorized set(const Vectorized& a, const Vectorized& b, int64_t count = size()) { +======= + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + return _mm256_blendv_epi8(a.values, b.values, mask.values); + } + template + static Vectorized arange( + T base = 0.f, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step, + base + 8 * step, + base + 9 * step, + base + 10 * step, + base + 11 * step, + base + 12 * step, + base + 13 * step, + base + 14 * step, + base + 15 * step); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -312,9 +487,16 @@ static_assert( return b; } +<<<<<<< HEAD // 'const' type qualifier on return type has no effect, but sleef defines this this way // For example `Sleef_exp2f8_u10` signature is `const __m256 (__m256)` C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wignored-qualifiers") +======= + // 'const' type qualifier on return type has no effect, but sleef defines this + // this way For example `Sleef_exp2f8_u10` signature is `const __m256 + // (__m256)` + C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wignored-qualifiers") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized map(SLEEF_CONST __m256 (*SLEEF_CONST_OLD vop)(__m256)) const { __m256 lo, hi; cvt_to_fp32(values, lo, hi); @@ -322,7 +504,11 @@ C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wignored-qualifiers") const auto o2 = vop(hi); return cvt_from_fp32(o1, o2); } +<<<<<<< HEAD C10_DIAGNOSTIC_POP() +======= + C10_DIAGNOSTIC_POP() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized isnan() const { __m256 lo, hi; cvt_to_fp32(values, lo, hi); @@ -376,7 +562,11 @@ C10_DIAGNOSTIC_POP() Vectorized atanh() const { return map(Sleef_atanhf8_u10); } +<<<<<<< HEAD 
Vectorized atan2(const Vectorized &b) const { +======= + Vectorized atan2(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 lo, hi; __m256 b1, b2; cvt_to_fp32(values, lo, hi); @@ -385,12 +575,20 @@ C10_DIAGNOSTIC_POP() auto o2 = Sleef_atan2f8_u10(hi, b2); return cvt_from_fp32(o1, o2); } +<<<<<<< HEAD Vectorized copysign(const Vectorized &sign) const { // copy sign bit (0x8000) from sign and remaining bits from values __m256i mask_value = _mm256_set1_epi32(~0x80008000); __m256i mask_signbit = _mm256_set1_epi32(0x80008000); return Vectorized( _mm256_or_si256( +======= + Vectorized copysign(const Vectorized& sign) const { + // copy sign bit (0x8000) from sign and remaining bits from values + __m256i mask_value = _mm256_set1_epi32(~0x80008000); + __m256i mask_signbit = _mm256_set1_epi32(0x80008000); + return Vectorized(_mm256_or_si256( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm256_and_si256(values, mask_value), _mm256_and_si256(sign, mask_signbit))); } @@ -426,7 +624,11 @@ C10_DIAGNOSTIC_POP() Vectorized exp_u20() const { return exp(); } +<<<<<<< HEAD Vectorized fmod(const Vectorized & q) const { +======= + Vectorized fmod(const Vectorized& q) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 x_lo, x_hi; cvt_to_fp32(values, x_lo, x_hi); __m256 q_lo, q_hi; @@ -435,7 +637,11 @@ C10_DIAGNOSTIC_POP() auto o2 = Sleef_fmodf8(x_hi, q_hi); return cvt_from_fp32(o1, o2); } +<<<<<<< HEAD Vectorized hypot(const Vectorized &b) const { +======= + Vectorized hypot(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 lo, hi; __m256 b1, b2; cvt_to_fp32(values, lo, hi); @@ -490,7 +696,11 @@ C10_DIAGNOSTIC_POP() const auto o2 = _mm256_loadu_ps(tmp2); return cvt_from_fp32(o1, o2); } +<<<<<<< HEAD Vectorized igamma(const Vectorized &x) const { +======= + Vectorized igamma(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 lo, hi; __m256 xlo, xhi; cvt_to_fp32(values, lo, hi); @@ -510,7 +720,11 @@ C10_DIAGNOSTIC_POP() return cvt_from_fp32(o1, o2); } +<<<<<<< HEAD Vectorized igammac(const Vectorized &x) const { +======= + Vectorized igammac(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 lo, hi; __m256 xlo, xhi; cvt_to_fp32(values, lo, hi); @@ -573,8 +787,15 @@ C10_DIAGNOSTIC_POP() Vectorized round() const { __m256 lo, hi; cvt_to_fp32(values, lo, hi); +<<<<<<< HEAD auto o1 = _mm256_round_ps(lo, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); auto o2 = _mm256_round_ps(hi, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +======= + auto o1 = + _mm256_round_ps(lo, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + auto o2 = + _mm256_round_ps(hi, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return cvt_from_fp32(o1, o2); } Vectorized tan() const { @@ -616,7 +837,11 @@ C10_DIAGNOSTIC_POP() auto o2 = _mm256_div_ps(ones, _mm256_sqrt_ps(hi)); return cvt_from_fp32(o1, o2); } 
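// Illustrative sketch (not part of the patch; helper name is mine): the scalar
// equivalent of the copysign() bit trick above. For bfloat16/half the sign sits
// in bit 15 of the 16-bit payload, so copysign needs no float arithmetic at all:
// keep the low 15 magnitude bits and OR in the sign bit taken from `sign`.
#include <cstdint>

inline uint16_t copysign_bits16(uint16_t magnitude, uint16_t sign) {
  return static_cast<uint16_t>((magnitude & 0x7FFF) | (sign & 0x8000));
}
// The AVX2 member above applies the same two masks (~0x80008000 / 0x80008000)
// to sixteen packed 16-bit lanes at once via _mm256_and_si256 / _mm256_or_si256.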
+<<<<<<< HEAD Vectorized pow(const Vectorized &b) const { +======= + Vectorized pow(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 lo, hi; __m256 b1, b2; cvt_to_fp32(values, lo, hi); @@ -625,8 +850,14 @@ C10_DIAGNOSTIC_POP() auto o2 = Sleef_powf8_u10(hi, b2); return cvt_from_fp32(o1, o2); } +<<<<<<< HEAD private: template +======= + + private: + template +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized inline binary_compare(const VectorizedType& b, Op op) const { __m256 a_lo, a_hi; __m256 b_lo, b_hi; @@ -634,6 +865,7 @@ C10_DIAGNOSTIC_POP() cvt_to_fp32(b.values, b_lo, b_hi); auto o1 = op(a_lo, b_lo); auto o2 = op(a_hi, b_hi); +<<<<<<< HEAD return cvt_from_fp32(o1, o2); } @@ -660,6 +892,49 @@ C10_DIAGNOSTIC_POP() template static inline Vectorized binary_op_as_fp32(const Vectorized& a, const Vectorized& b, Op op) { +======= + return cvt_from_fp32(o1, o2); + } + + public: + Vectorized inline operator>(const Vectorized& other) const { + return binary_compare(other, [](__m256 x, __m256 y) { + return _mm256_cmp_ps(x, y, _CMP_GT_OQ); + }); + } + Vectorized inline operator<(const Vectorized& other) const { + return binary_compare(other, [](__m256 x, __m256 y) { + return _mm256_cmp_ps(x, y, _CMP_LT_OQ); + }); + } + Vectorized inline operator>=(const Vectorized& other) const { + return binary_compare(other, [](__m256 x, __m256 y) { + return _mm256_cmp_ps(x, y, _CMP_GE_OQ); + }); + } + Vectorized inline operator<=(const Vectorized& other) const { + return binary_compare(other, [](__m256 x, __m256 y) { + return _mm256_cmp_ps(x, y, _CMP_LE_OQ); + }); + } + Vectorized inline operator==(const Vectorized16& other) const { + return binary_compare(other, [](__m256 x, __m256 y) { + return _mm256_cmp_ps(x, y, _CMP_EQ_OQ); + }); + } + Vectorized inline operator!=(const Vectorized16& other) const { + return binary_compare(other, [](__m256 x, __m256 y) { + return _mm256_cmp_ps(x, y, _CMP_NEQ_UQ); + }); + } +}; + +template +static inline Vectorized binary_op_as_fp32( + const Vectorized& a, + const Vectorized& b, + Op op) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a_lo, a_hi; __m256 b_lo, b_hi; cvt_to_fp32(__m256i(a), a_lo, a_hi); @@ -669,6 +944,7 @@ static inline Vectorized binary_op_as_fp32(const Vectorized& a, const Vect return cvt_from_fp32(o1, o2); } +<<<<<<< HEAD #define CONVERT_VECTORIZED_INIT(type, name) \ inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ __m256 o1, o2; \ @@ -735,3 +1011,80 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec #endif // CPU_CAPABILITY_AVX2 }} // namespace::at::vec::CPU_CAPABILITY +======= +#define CONVERT_VECTORIZED_INIT(type, name) \ + inline std::tuple, Vectorized> \ + convert_##name##_float(const Vectorized& a) { \ + __m256 o1, o2; \ + cvt_to_fp32(__m256i(a), o1, o2); \ + return std::make_tuple(o1, o2); \ + } \ + inline Vectorized convert_float_##name( \ + const Vectorized& a, const Vectorized& b) { \ + return cvt_from_fp32(__m256(a), __m256(b)); \ + } + +#define LOAD_FP32_VECTORIZED_INIT(type, name) \ + inline void load_fp32_from_##name( \ + const type* data, Vectorized& out) { \ + auto values = _mm_loadu_si128(reinterpret_cast(data)); \ + __m256 out_values; \ + cvt_to_fp32(values, out_values); \ + out = out_values; \ + } \ + 
\ + inline void load_fp32_from_##name( \ + const type* data, Vectorized& out1, Vectorized& out2) { \ + auto vec = Vectorized::loadu(data); \ + __m256 out1_values, out2_values; \ + cvt_to_fp32(vec, out1_values, out2_values); \ + out1 = out1_values; \ + out2 = out2_values; \ + } + +#else // CPU_CAPABILITY_AVX2 + +#define CONVERT_NON_VECTORIZED_INIT(type, name) \ + inline std::tuple, Vectorized> \ + convert_##name##_float(const Vectorized& a) { \ + constexpr int64_t K = Vectorized::size(); \ + __at_align__ float arr[K]; \ + __at_align__ type arr2[K]; \ + a.store(arr2); \ + convert(arr2, arr, K); \ + return std::make_tuple( \ + Vectorized::loadu(arr), \ + Vectorized::loadu(arr + Vectorized::size())); \ + } \ + inline Vectorized convert_float_##name( \ + const Vectorized& a, const Vectorized& b) { \ + constexpr int64_t K = Vectorized::size(); \ + __at_align__ float arr[K]; \ + __at_align__ type arr2[K]; \ + a.store(arr); \ + b.store(arr + Vectorized::size()); \ + convert(arr, arr2, K); \ + return Vectorized::loadu(arr2); \ + } + +#define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ + inline void load_fp32_from_##name( \ + const type* data, Vectorized& out) { \ + __at_align__ float values[Vectorized::size()]; \ + for (const auto k : c10::irange(Vectorized::size())) { \ + values[k] = data[k]; \ + } \ + out = Vectorized::loadu(values); \ + } \ + \ + inline void load_fp32_from_##name( \ + const type* data, Vectorized& out1, Vectorized& out2) { \ + load_fp32_from_##name(data, out1); \ + data += Vectorized::size(); \ + load_fp32_from_##name(data, out2); \ + } + +#endif // CPU_CAPABILITY_AVX2 +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h index ac69e8613f71..d6c6df51b123 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h @@ -13,8 +13,16 @@ inline namespace CPU_CAPABILITY { #if defined(CPU_CAPABILITY_AVX2) template <> +<<<<<<< HEAD class Vectorized: public Vectorized16 { public: +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorized16 { + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using Vectorized16::Vectorized16; using value_type = BFloat16; @@ -29,6 +37,7 @@ class Vectorized: public Vectorized16 { Vectorized le(const Vectorized& other) const; }; +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { return _mm256_add_ps(x, y); }); } @@ -67,6 +76,74 @@ inline Vectorized Vectorized::lt(const Vectorized& return (*this < other) & Vectorized(1.0f); } inline Vectorized Vectorized::le(const Vectorized& other) const { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { + return _mm256_add_ps(x, y); + }); +} +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { + return _mm256_sub_ps(x, y); + }); +} +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { + return 
_mm256_mul_ps(x, y); + }); +} +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { + return _mm256_div_ps(x, y); + }); +} +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return _mm256_and_si256(a, b); +} +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return _mm256_or_si256(a, b); +} +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return _mm256_xor_si256(a, b); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0f); +} +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} +inline Vectorized Vectorized::le( + const Vectorized& other) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*this <= other) & Vectorized(1.0f); } @@ -78,7 +155,13 @@ inline Vectorized Vectorized::frac() const { // Implements the IEEE 754 201X `maximum` operation, which propagates NaN if // either input is a NaN. template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a_lo, a_hi; __m256 b_lo, b_hi; cvtbf16_fp32(__m256i(a), a_lo, a_hi); @@ -96,7 +179,13 @@ Vectorized inline maximum(const Vectorized& a, const Vectori // Implements the IEEE 754 201X `minimum` operation, which propagates NaN if // either input is a NaN. 
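// Illustrative sketch (assumes AVX2; not part of the patch, helper name is mine):
// one way to get the NaN-propagating behaviour documented for the maximum()/
// minimum() specializations here. _mm256_max_ps on its own simply returns its
// second operand whenever a lane compares unordered, so the unordered-compare
// mask (all-ones wherever either input is NaN) is OR'ed back in; an all-ones
// bit pattern is itself a NaN, which forces those lanes to NaN in the result.
#include <immintrin.h>

inline __m256 nan_propagating_max_ps(__m256 a, __m256 b) {
  __m256 m   = _mm256_max_ps(a, b);               // not NaN-aware by itself
  __m256 nan = _mm256_cmp_ps(a, b, _CMP_UNORD_Q); // 0xFFFFFFFF where a or b is NaN
  return _mm256_or_ps(m, nan);                    // force those lanes to NaN
}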
template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a_lo, a_hi; __m256 b_lo, b_hi; cvtbf16_fp32(__m256i(a), a_lo, a_hi); @@ -112,8 +201,15 @@ Vectorized inline minimum(const Vectorized& a, const Vectori } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a_lo, a_hi; __m256 min_lo, min_hi; __m256 max_lo, max_hi; @@ -126,7 +222,13 @@ Vectorized inline clamp(const Vectorized& a, } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a_lo, a_hi; __m256 max_lo, max_hi; cvtbf16_fp32(__m256i(a), a_lo, a_hi); @@ -137,7 +239,13 @@ Vectorized inline clamp_max(const Vectorized& a, const Vecto } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a_lo, a_hi; __m256 min_lo, min_hi; cvtbf16_fp32(__m256i(a), a_lo, a_hi); @@ -153,8 +261,15 @@ inline void convert(const BFloat16* src, BFloat16* dst, int64_t n) { #ifndef __msvc_cl__ #pragma unroll #endif +<<<<<<< HEAD for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { auto vsrc = _mm256_loadu_si256(reinterpret_cast<__m256i*>((void*)(src + i))); +======= + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + auto vsrc = + _mm256_loadu_si256(reinterpret_cast<__m256i*>((void*)(src + i))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm256_storeu_si256(reinterpret_cast<__m256i*>((void*)(dst + i)), vsrc); } #ifndef __msvc_cl__ @@ -168,7 +283,12 @@ inline void convert(const BFloat16* src, BFloat16* dst, int64_t n) { template <> inline void convert(const float* src, BFloat16* dst, int64_t n) { int64_t i; +<<<<<<< HEAD for (i = 0; i + Vectorized::size() <= n; i += Vectorized::size()) { +======= + for (i = 0; i + Vectorized::size() <= n; + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a = _mm256_loadu_ps(&src[i]); __m256 b = _mm256_loadu_ps(&src[i + 8]); @@ -182,7 +302,11 @@ inline void convert(const float* src, BFloat16* dst, int64_t n) { template <> inline void convert(const double* src, BFloat16* dst, int64_t n) { +<<<<<<< HEAD auto load_float = [](const double *src) -> __m256 { +======= + auto load_float = [](const double* src) -> __m256 { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Load one float vector from an array of doubles __m128 a = 
_mm256_cvtpd_ps(_mm256_loadu_pd(src)); __m128 b = _mm256_cvtpd_ps(_mm256_loadu_pd(src + 4)); @@ -190,7 +314,12 @@ inline void convert(const double* src, BFloat16* dst, int64_t n) { }; int64_t i; +<<<<<<< HEAD for (i = 0; i + Vectorized::size() <= n; i += Vectorized::size()) { +======= + for (i = 0; i + Vectorized::size() <= n; + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a = load_float(&src[i]); __m256 b = load_float(&src[i + 8]); @@ -203,8 +332,15 @@ inline void convert(const double* src, BFloat16* dst, int64_t n) { } template <> +<<<<<<< HEAD Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a_lo, a_hi; __m256 b_lo, b_hi; __m256 c_lo, c_hi; @@ -221,10 +357,21 @@ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16) #else // defined(CPU_CAPABILITY_AVX2) +<<<<<<< HEAD #if !(defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && !defined(CPU_CAPABILITY_SVE256)) +======= +#if !( \ + defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \ + !defined(CPU_CAPABILITY_SVE256)) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16) #endif LOAD_FP32_NON_VECTORIZED_INIT(BFloat16, bf16) #endif // defined(CPU_CAPABILITY_AVX2) +<<<<<<< HEAD }} // namsepace at::vec::CPU_CAPABILITY +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h index b4d8776d7ae4..155eaf03e4e2 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h @@ -3,10 +3,17 @@ // DO NOT DEFINE STATIC DATA IN THIS HEADER! 
// See Note [Do not compile initializers with AVX] +<<<<<<< HEAD #include #include #include #include +======= +#include +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if defined(CPU_CAPABILITY_AVX2) #define SLEEF_STATIC_LIBS @@ -19,10 +26,23 @@ inline namespace CPU_CAPABILITY { #if defined(CPU_CAPABILITY_AVX2) +<<<<<<< HEAD template <> class Vectorized> { private: __m256d values; public: +======= +template <> +struct is_vec_specialized_for> : std::bool_constant { +}; + +template <> +class Vectorized> { + private: + __m256d values; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = c10::complex; using size_type = int; static constexpr size_type size() { @@ -33,20 +53,35 @@ template <> class Vectorized> { Vectorized(c10::complex val) { double real_value = val.real(); double imag_value = val.imag(); +<<<<<<< HEAD values = _mm256_setr_pd(real_value, imag_value, real_value, imag_value); } Vectorized(c10::complex val1, c10::complex val2) { values = _mm256_setr_pd(val1.real(), val1.imag(), val2.real(), val2.imag()); +======= + values = _mm256_setr_pd(real_value, imag_value, real_value, imag_value); + } + Vectorized(c10::complex val1, c10::complex val2) { + values = _mm256_setr_pd(val1.real(), val1.imag(), val2.real(), val2.imag()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } operator __m256d() const { return values; } template +<<<<<<< HEAD static Vectorized> blend(const Vectorized>& a, const Vectorized>& b) { // convert c10::complex index mask to V index mask: xy -> xxyy static_assert (mask > -1 && mask < 4, "Unexpected mask value"); +======= + static Vectorized> blend( + const Vectorized>& a, + const Vectorized>& b) { + // convert c10::complex index mask to V index mask: xy -> xxyy + static_assert(mask > -1 && mask < 4, "Unexpected mask value"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (mask) { case 0: return a; @@ -54,6 +89,7 @@ template <> class Vectorized> { return _mm256_blend_pd(a.values, b.values, 0x03); case 2: return _mm256_blend_pd(a.values, b.values, 0x0c); +<<<<<<< HEAD case 3: break; } return b; @@ -72,6 +108,31 @@ template <> class Vectorized> { } static Vectorized> set(const Vectorized>& a, const Vectorized>& b, int64_t count = size()) { +======= + case 3: + break; + } + return b; + } + static Vectorized> blendv( + const Vectorized>& a, + const Vectorized>& b, + const Vectorized>& mask) { + // convert c10::complex index mask to V index mask: xy -> xxyy + auto mask_ = _mm256_unpacklo_pd(mask.values, mask.values); + return _mm256_blendv_pd(a.values, b.values, mask_); + } + template + static Vectorized> arange( + c10::complex base = 0., + step_t step = static_cast(1)) { + return Vectorized>(base, base + step); + } + static Vectorized> set( + const Vectorized>& a, + const Vectorized>& b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -80,6 +141,7 @@ template <> class Vectorized> { } return b; } +<<<<<<< HEAD static Vectorized> loadu(const void* ptr, int64_t count = size()) { if (count == size()) return _mm256_loadu_pd(reinterpret_cast(ptr)); @@ -89,6 
+151,20 @@ template <> class Vectorized> { // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. for (const auto i : c10::irange(2*size())) { +======= + static Vectorized> loadu( + const void* ptr, + int64_t count = size()) { + if (count == size()) + return _mm256_loadu_pd(reinterpret_cast(ptr)); + + __at_align__ double tmp_values[2 * size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. + for (const auto i : c10::irange(2 * size())) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) tmp_values[i] = 0.0; } std::memcpy( @@ -101,14 +177,25 @@ template <> class Vectorized> { if (count == size()) { _mm256_storeu_pd(reinterpret_cast(ptr), values); } else if (count > 0) { +<<<<<<< HEAD double tmp_values[2*size()]; +======= + double tmp_values[2 * size()]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm256_storeu_pd(reinterpret_cast(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(c10::complex)); } } +<<<<<<< HEAD const c10::complex& operator[](int idx) const = delete; c10::complex& operator[](int idx) = delete; Vectorized> map(c10::complex (*const f)(const c10::complex &)) const { +======= + const c10::complex& operator[](int idx) const = delete; + c10::complex& operator[](int idx) = delete; + Vectorized> map( + c10::complex (*const f)(const c10::complex&)) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ c10::complex tmp[size()]; store(tmp); for (const auto i : c10::irange(size())) { @@ -117,6 +204,7 @@ template <> class Vectorized> { return loadu(tmp); } __m256d abs_2_() const { +<<<<<<< HEAD auto val_2 = _mm256_mul_pd(values, values); // a*a b*b return _mm256_hadd_pd(val_2, val_2); // a*a+b*b a*a+b*b } @@ -141,6 +229,38 @@ template <> class Vectorized> { 0xFFFFFFFFFFFFFFFF, 0x0000000000000000)); auto angle = _mm256_permute_pd(angle_(), 0x05); // angle 90-angle return _mm256_and_pd(angle, real_mask); // angle 0 +======= + auto val_2 = _mm256_mul_pd(values, values); // a*a b*b + return _mm256_hadd_pd(val_2, val_2); // a*a+b*b a*a+b*b + } + __m256d abs_() const { + auto real = _mm256_movedup_pd(values); // real real + // movehdup_pd does not exist... 
+ auto imag = _mm256_permute_pd(values, 0xf); // imag imag + return Sleef_hypotd4_u05(real, imag); // abs abs + } + Vectorized> abs() const { + const __m256d real_mask = _mm256_castsi256_pd(_mm256_setr_epi64x( + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000)); + return _mm256_and_pd(abs_(), real_mask); // abs 0 + } + __m256d angle_() const { + // angle = atan2(b/a) + auto b_a = _mm256_permute_pd(values, 0x05); // b a + return Sleef_atan2d4_u10(values, b_a); // 90-angle angle + } + Vectorized> angle() const { + const __m256d real_mask = _mm256_castsi256_pd(_mm256_setr_epi64x( + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000)); + auto angle = _mm256_permute_pd(angle_(), 0x05); // angle 90-angle + return _mm256_and_pd(angle, real_mask); // angle 0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized> sgn() const { auto abs = abs_(); @@ -150,14 +270,23 @@ template <> class Vectorized> { return _mm256_blendv_pd(div, zero, mask); } __m256d real_() const { +<<<<<<< HEAD const __m256d real_mask = _mm256_castsi256_pd(_mm256_setr_epi64x(0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000)); +======= + const __m256d real_mask = _mm256_castsi256_pd(_mm256_setr_epi64x( + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_and_pd(values, real_mask); } Vectorized> real() const { return real_(); } __m256d imag_() const { +<<<<<<< HEAD const __m256d imag_mask = _mm256_castsi256_pd(_mm256_setr_epi64x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF)); return _mm256_and_pd(values, imag_mask); @@ -168,12 +297,32 @@ template <> class Vectorized> { __m256d conj_() const { const __m256d sign_mask = _mm256_setr_pd(0.0, -0.0, 0.0, -0.0); return _mm256_xor_pd(values, sign_mask); // a -b +======= + const __m256d imag_mask = _mm256_castsi256_pd(_mm256_setr_epi64x( + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF)); + return _mm256_and_pd(values, imag_mask); + } + Vectorized> imag() const { + return _mm256_permute_pd(imag_(), 0x05); // b a + } + __m256d conj_() const { + const __m256d sign_mask = _mm256_setr_pd(0.0, -0.0, 0.0, -0.0); + return _mm256_xor_pd(values, sign_mask); // a -b +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized> conj() const { return conj_(); } Vectorized> log() const { +<<<<<<< HEAD // Most trigonomic ops use the log() op to improve complex number performance. +======= + // Most trigonomic ops use the log() op to improve complex number + // performance. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(std::log); } Vectorized> log2() const { @@ -188,7 +337,12 @@ template <> class Vectorized> { return map(std::log1p); } Vectorized> asin() const { +<<<<<<< HEAD // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. +======= + // TODO: The vectorized implementation requires special handling for the + // case where real number/imag number is 0/Inf/NaN. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // // asin(x) // // = -i*ln(iz + sqrt(1 -z^2)) // // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi))) @@ -196,6 +350,7 @@ template <> class Vectorized> { // const __m256d one = _mm256_set1_pd(1); // auto conj = conj_(); +<<<<<<< HEAD // auto b_a = _mm256_permute_pd(conj, 0x05); //-b a // auto ab = _mm256_mul_pd(conj, b_a); //-ab -ab // auto im = _mm256_add_pd(ab, ab); //-2ab -2ab @@ -207,6 +362,20 @@ template <> class Vectorized> { // auto root = Vectorized(_mm256_blend_pd(re, im, 0x0A)).sqrt(); //sqrt(re + i*im) // auto ln = Vectorized(_mm256_add_pd(b_a, root)).log(); //ln(iz + sqrt()) // return Vectorized(_mm256_permute_pd(ln.values, 0x05)).conj(); //-i*ln() +======= + // auto b_a = _mm256_permute_pd(conj, 0x05); //-b a + // auto ab = _mm256_mul_pd(conj, b_a); //-ab + // -ab auto im = _mm256_add_pd(ab, ab); //-2ab -2ab + + // auto val_2 = _mm256_mul_pd(values, values); // a*a + // b*b auto re = _mm256_hsub_pd(val_2, _mm256_permute_pd(val_2, 0x05)); // + // a*a-b*b b*b-a*a re = _mm256_sub_pd(one, re); + + // auto root = Vectorized(_mm256_blend_pd(re, im, 0x0A)).sqrt(); //sqrt(re + + // i*im) auto ln = Vectorized(_mm256_add_pd(b_a, root)).log(); //ln(iz + + // sqrt()) return Vectorized(_mm256_permute_pd(ln.values, 0x05)).conj(); + // //-i*ln() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(std::asin); } Vectorized> acos() const { @@ -220,6 +389,7 @@ template <> class Vectorized> { return map(std::atanh); } Vectorized> exp() const { +<<<<<<< HEAD // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. // //exp(a + bi) // // = exp(a)*(cos(b) + sin(b)i) @@ -229,13 +399,32 @@ template <> class Vectorized> { // auto sin_cos = Sleef_sincosd4_u10(values); //[sin(a), cos(a)] [sin(b), cos(b)] // auto cos_sin = _mm256_blend_pd(_mm256_permute_pd(sin_cos.y, 0x05), // sin_cos.x, 0x0A); //cos(b) sin(b) +======= + // TODO: The vectorized implementation requires special handling for the + // case where real number/imag number is 0/Inf/NaN. 
+ // //exp(a + bi) + // // = exp(a)*(cos(b) + sin(b)i) + // auto exp = Sleef_expd4_u10(values); //exp(a) exp(b) exp = + // _mm256_blend_pd(exp, _mm256_permute_pd(exp, 0x05), 0x0A); //exp(a) + // exp(a) + + // auto sin_cos = Sleef_sincosd4_u10(values); //[sin(a), cos(a)] [sin(b), + // cos(b)] auto cos_sin = _mm256_blend_pd(_mm256_permute_pd(sin_cos.y, + // 0x05), + // sin_cos.x, 0x0A); //cos(b) sin(b) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // return _mm256_mul_pd(exp, cos_sin); return map(std::exp); } Vectorized> exp2() const { // Use identity 2**x = exp(log(2) * x) const __m256d ln_2 = _mm256_set1_pd(c10::ln_2); +<<<<<<< HEAD Vectorized> scaled_values = _mm256_mul_pd(values, ln_2); +======= + Vectorized> scaled_values = + _mm256_mul_pd(values, ln_2); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return scaled_values.exp(); } Vectorized> expm1() const { @@ -264,7 +453,12 @@ template <> class Vectorized> { return _mm256_sub_pd(zero, values); } Vectorized> round() const { +<<<<<<< HEAD return _mm256_round_pd(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +======= + return _mm256_round_pd( + values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized> tan() const { return map(std::tan); @@ -282,7 +476,12 @@ template <> class Vectorized> { Vectorized> rsqrt() const { return sqrt().reciprocal(); } +<<<<<<< HEAD Vectorized> pow(const Vectorized> &exp) const { +======= + Vectorized> pow( + const Vectorized>& exp) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ c10::complex x_tmp[size()]; __at_align__ c10::complex y_tmp[size()]; store(x_tmp); @@ -295,6 +494,7 @@ template <> class Vectorized> { // Comparison using the _CMP_**_OQ predicate. // `O`: get false if an operand is NaN // `Q`: do not raise if an operand is NaN +<<<<<<< HEAD Vectorized> operator==(const Vectorized>& other) const { return _mm256_cmp_pd(values, other.values, _CMP_EQ_OQ); } @@ -341,13 +541,88 @@ template <> Vectorized> inline operator*(const Vectorized Vectorized> inline operator/(const Vectorized> &a, const Vectorized> &b) { // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. 
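// Illustrative sketch (assumes AVX2; not part of the patch): the effect of the
// _CMP_EQ_OQ / _CMP_NEQ_UQ predicates used by the complex operator== / operator!=
// above. "O"rdered predicates yield false when either lane is NaN, "U"nordered
// predicates yield true, and the "Q" (quiet) variants never raise FP exceptions.
#include <immintrin.h>
#include <cmath>

inline void cmp_predicate_demo(int* eq_mask, int* ne_mask) {
  const __m256d x = _mm256_set1_pd(std::nan(""));
  const __m256d y = _mm256_set1_pd(1.0);
  // EQ_OQ: NaN lanes compare as "not equal" -> mask bits come back 0
  *eq_mask = _mm256_movemask_pd(_mm256_cmp_pd(x, y, _CMP_EQ_OQ));  // 0x0
  // NEQ_UQ: NaN lanes compare as "not equal" -> mask bits come back 1
  *ne_mask = _mm256_movemask_pd(_mm256_cmp_pd(x, y, _CMP_NEQ_UQ)); // 0xF
}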
+======= + Vectorized> operator==( + const Vectorized>& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_EQ_OQ); + } + Vectorized> operator!=( + const Vectorized>& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_NEQ_UQ); + } + Vectorized> operator<( + const Vectorized>&) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator<=( + const Vectorized>&) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator>( + const Vectorized>&) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator>=( + const Vectorized>&) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vectorized> eq( + const Vectorized>& other) const; + Vectorized> ne( + const Vectorized>& other) const; +}; + +template <> +Vectorized> inline operator+( + const Vectorized>& a, + const Vectorized>& b) { + return _mm256_add_pd(a, b); +} + +template <> +Vectorized> inline operator-( + const Vectorized>& a, + const Vectorized>& b) { + return _mm256_sub_pd(a, b); +} + +template <> +Vectorized> inline operator*( + const Vectorized>& a, + const Vectorized>& b) { + //(a + bi) * (c + di) = (ac - bd) + (ad + bc)i + const __m256d sign_mask = _mm256_setr_pd(0.0, -0.0, 0.0, -0.0); + auto ac_bd = _mm256_mul_pd(a, b); // ac bd + + auto d_c = _mm256_permute_pd(b, 0x05); // d c + d_c = _mm256_xor_pd(sign_mask, d_c); // d -c + auto ad_bc = _mm256_mul_pd(a, d_c); // ad -bc + + auto ret = _mm256_hsub_pd(ac_bd, ad_bc); // ac - bd ad + bc + return ret; +} + +template <> +Vectorized> inline operator/( + const Vectorized>& a, + const Vectorized>& b) { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. 
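// Illustrative sketch (not from the patch; helper name is mine): scalar form of
// the interleaved {re, im} product computed by the AVX2 complex operator* above,
//   (a + bi) * (c + di) = (ac - bd) + (ad + bc)i.
// The vector code gets {ac, bd} with one multiply, {ad, -bc} with a permute plus
// sign flip plus multiply, and a single _mm256_hsub_pd then produces
// {ac - bd, ad - (-bc)} = {ac - bd, ad + bc}.
inline void complex_mul_scalar(const double a[2], const double b[2], double out[2]) {
  const double ac = a[0] * b[0], bd = a[1] * b[1]; // lanes of _mm256_mul_pd(a, b)
  const double ad = a[0] * b[1], bc = a[1] * b[0]; // lanes of a * sign-flipped permute of b
  out[0] = ac - bd; // real part
  out[1] = ad + bc; // imaginary part
}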
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // //re + im*i = (a + bi) / (c + di) // auto mask = _mm256_set1_pd(-0.f); // auto fabs_cd = _mm256_andnot_pd(mask, b); // |c| |d| // auto fabs_dc = _mm256_permute_pd(fabs_cd, 0x05); // |d| |c| +<<<<<<< HEAD // auto scale = _mm256_div_pd(_mm256_set1_pd(1.0f), _mm256_max_pd(fabs_cd, fabs_dc)); // 1/sc 1/sc // auto a2 = _mm256_mul_pd(a, scale); // a/sc b/sc // auto b2 = _mm256_mul_pd(b, scale); // c/sc d/sc +======= + // auto scale = _mm256_div_pd(_mm256_set1_pd(1.0f), _mm256_max_pd(fabs_cd, + // fabs_dc)); // 1/sc 1/sc auto a2 = _mm256_mul_pd(a, scale); // + // a/sc b/sc auto b2 = _mm256_mul_pd(b, scale); // c/sc d/sc +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // auto acbd2 = _mm256_mul_pd(a2, b2); // const __m256d sign_mask = _mm256_setr_pd(-0.0, 0.0, -0.0, 0.0); @@ -357,12 +632,24 @@ template <> Vectorized> inline operator/(const Vectorized>(b2).abs_2_(); // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 // res2 = _mm256_div_pd(res2, denom2); // return res2; __at_align__ c10::complex tmp1[Vectorized>::size()]; __at_align__ c10::complex tmp2[Vectorized>::size()]; __at_align__ c10::complex out[Vectorized>::size()]; +======= + // auto denom2 = Vectorized>(b2).abs_2_(); // + // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 res2 = _mm256_div_pd(res2, denom2); return + // res2; + __at_align__ c10::complex + tmp1[Vectorized>::size()]; + __at_align__ c10::complex + tmp2[Vectorized>::size()]; + __at_align__ c10::complex + out[Vectorized>::size()]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) a.store(tmp1); b.store(tmp2); for (const auto i : c10::irange(Vectorized>::size())) { @@ -372,8 +659,15 @@ template <> Vectorized> inline operator/(const Vectorized> Vectorized>::reciprocal() const{ // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. +======= +inline Vectorized> Vectorized< + c10::complex>::reciprocal() const { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // //re + im*i = (a + bi) / (c + di) // //re = (ac + bd)/abs_2() = c/abs_2() // //im = (bc - ad)/abs_2() = d/abs_2() @@ -388,21 +682,41 @@ inline Vectorized> Vectorized>::recipr return loadu(tmp); } +<<<<<<< HEAD inline Vectorized> Vectorized>::atan() const { // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. +======= +inline Vectorized> Vectorized>::atan() + const { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // // atan(x) = i/2 * ln((i + z)/(i - z)) // const __m256d i = _mm256_setr_pd(0.0, 1.0, 0.0, 1.0); // const Vectorized i_half = _mm256_setr_pd(0.0, 0.5, 0.0, 0.5); +<<<<<<< HEAD // auto sum = Vectorized(_mm256_add_pd(i, values)); // a 1+b // auto sub = Vectorized(_mm256_sub_pd(i, values)); // -a 1-b // auto ln = (sum/sub).log(); // ln((i + z)/(i - z)) // return i_half*ln; // i/2*ln() +======= + // auto sum = Vectorized(_mm256_add_pd(i, values)); // a + // 1+b auto sub = Vectorized(_mm256_sub_pd(i, values)); // -a 1-b auto + // ln = (sum/sub).log(); // ln((i + + // z)/(i - z)) return i_half*ln; // i/2*ln() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(std::atan); } template <> +<<<<<<< HEAD Vectorized> inline maximum(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline maximum( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto abs_a = a.abs_2_(); auto abs_b = b.abs_2_(); auto mask = _mm256_cmp_pd(abs_a, abs_b, _CMP_LT_OQ); @@ -413,7 +727,13 @@ Vectorized> inline maximum(const Vectorized +<<<<<<< HEAD Vectorized> inline minimum(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline minimum( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto abs_a = a.abs_2_(); auto abs_b = b.abs_2_(); auto mask = _mm256_cmp_pd(abs_a, abs_b, _CMP_GT_OQ); @@ -424,16 +744,29 @@ Vectorized> inline minimum(const Vectorized +<<<<<<< HEAD Vectorized> inline operator&(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline operator&( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_and_pd(a, b); } template <> +<<<<<<< HEAD Vectorized> inline operator|(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline operator|( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_or_pd(a, b); } template <> +<<<<<<< HEAD Vectorized> inline operator^(const Vectorized>& a, const Vectorized>& b) { return _mm256_xor_pd(a, b); } @@ -448,8 +781,37 @@ inline Vectorized> Vectorized>::ne(con auto ne = (*this != other); // compares real and imag individually // If either real numbers or imag numbers are not equal, then the complex numbers are not equal return (ne.real() | ne.imag()) & Vectorized>(_mm256_set1_pd(1.0)); +======= +Vectorized> inline operator^( + const Vectorized>& a, + const Vectorized>& b) { + return _mm256_xor_pd(a, b); +} + +inline Vectorized> Vectorized>::eq( + const Vectorized>& other) const { + auto eq = (*this == other); // compares real and imag individually + // If both real numbers and imag numbers are equal, then the complex numbers + // are equal + return (eq.real() & eq.imag()) & + Vectorized>(_mm256_set1_pd(1.0)); +} + +inline Vectorized> Vectorized>::ne( + const Vectorized>& other) const { + auto ne = (*this != other); // compares real and imag individually + // If either real 
numbers or imag numbers are not equal, then the complex + // numbers are not equal + return (ne.real() | ne.imag()) & + Vectorized>(_mm256_set1_pd(1.0)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif +<<<<<<< HEAD }} // namespace at::vec::CPU_CAPABILITY +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h index bec9490c7554..1ac0865043ee 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h @@ -3,10 +3,17 @@ // DO NOT DEFINE STATIC DATA IN THIS HEADER! // See Note [Do not compile initializers with AVX] +<<<<<<< HEAD #include #include #include #include +======= +#include +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if defined(CPU_CAPABILITY_AVX2) #define SLEEF_STATIC_LIBS #include @@ -18,10 +25,23 @@ inline namespace CPU_CAPABILITY { #if defined(CPU_CAPABILITY_AVX2) +<<<<<<< HEAD template <> class Vectorized> { private: __m256 values; public: +======= +template <> +struct is_vec_specialized_for> : std::bool_constant { +}; + +template <> +class Vectorized> { + private: + __m256 values; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = c10::complex; using size_type = int; static constexpr size_type size() { @@ -32,6 +52,7 @@ template <> class Vectorized> { Vectorized(c10::complex val) { float real_value = val.real(); float imag_value = val.imag(); +<<<<<<< HEAD values = _mm256_setr_ps(real_value, imag_value, real_value, imag_value, real_value, imag_value, @@ -44,18 +65,52 @@ template <> class Vectorized> { val3.real(), val3.imag(), val4.real(), val4.imag() ); +======= + values = _mm256_setr_ps( + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value); + } + Vectorized( + c10::complex val1, + c10::complex val2, + c10::complex val3, + c10::complex val4) { + values = _mm256_setr_ps( + val1.real(), + val1.imag(), + val2.real(), + val2.imag(), + val3.real(), + val3.imag(), + val4.real(), + val4.imag()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } operator __m256() const { return values; } template +<<<<<<< HEAD static Vectorized> blend(const Vectorized>& a, const Vectorized>& b) { // convert c10::complex index mask to V index mask: xy -> xxyy +======= + static Vectorized> blend( + const Vectorized>& a, + const Vectorized>& b) { + // convert c10::complex index mask to V index mask: xy -> xxyy +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static_assert(mask > -1 && mask < 16, "Unexpected mask range"); switch (mask) { case 0: return a; case 1: +<<<<<<< HEAD return _mm256_blend_ps(a.values, b.values, 0x03); //b0000 0001 = b0000 0011 case 2: return _mm256_blend_ps(a.values, b.values, 0x0C); //b0000 0010 = b0000 1100 @@ -103,6 +158,76 @@ template <> class Vectorized> { } static Vectorized> set(const Vectorized>& a, const Vectorized>& b, int64_t count = size()) { 
+======= + return _mm256_blend_ps( + a.values, b.values, 0x03); // b0000 0001 = b0000 0011 + case 2: + return _mm256_blend_ps( + a.values, b.values, 0x0C); // b0000 0010 = b0000 1100 + case 3: + return _mm256_blend_ps( + a.values, b.values, 0x0F); // b0000 0011 = b0000 1111 + case 4: + return _mm256_blend_ps( + a.values, b.values, 0x30); // b0000 0100 = b0011 0000 + case 5: + return _mm256_blend_ps( + a.values, b.values, 0x33); // b0000 0101 = b0011 0011 + case 6: + return _mm256_blend_ps( + a.values, b.values, 0x3C); // b0000 0110 = b0011 1100 + case 7: + return _mm256_blend_ps( + a.values, b.values, 0x3F); // b0000 0111 = b0011 1111 + case 8: + return _mm256_blend_ps( + a.values, b.values, 0xC0); // b0000 1000 = b1100 0000 + case 9: + return _mm256_blend_ps( + a.values, b.values, 0xC3); // b0000 1001 = b1100 0011 + case 10: + return _mm256_blend_ps( + a.values, b.values, 0xCC); // b0000 1010 = b1100 1100 + case 11: + return _mm256_blend_ps( + a.values, b.values, 0xCF); // b0000 1011 = b1100 1111 + case 12: + return _mm256_blend_ps( + a.values, b.values, 0xF0); // b0000 1100 = b1111 0000 + case 13: + return _mm256_blend_ps( + a.values, b.values, 0xF3); // b0000 1101 = b1111 0011 + case 14: + return _mm256_blend_ps( + a.values, b.values, 0xFC); // b0000 1110 = b1111 1100 + default: + break; + } + return b; + } + static Vectorized> blendv( + const Vectorized>& a, + const Vectorized>& b, + const Vectorized>& mask) { + // convert c10::complex index mask to V index mask: xy -> xxyy + auto mask_ = _mm256_unpacklo_ps(mask.values, mask.values); + return _mm256_blendv_ps(a.values, b.values, mask_); + } + template + static Vectorized> arange( + c10::complex base = 0., + step_t step = static_cast(1)) { + return Vectorized>( + base, + base + step, + base + c10::complex(2) * step, + base + c10::complex(3) * step); + } + static Vectorized> set( + const Vectorized>& a, + const Vectorized>& b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -115,6 +240,7 @@ template <> class Vectorized> { } return b; } +<<<<<<< HEAD static Vectorized> loadu(const void* ptr, int64_t count = size()) { if (count == size()) return _mm256_loadu_ps(reinterpret_cast(ptr)); @@ -124,6 +250,20 @@ template <> class Vectorized> { // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. for (const auto i : c10::irange(2*size())) { +======= + static Vectorized> loadu( + const void* ptr, + int64_t count = size()) { + if (count == size()) + return _mm256_loadu_ps(reinterpret_cast(ptr)); + + __at_align__ float tmp_values[2 * size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
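// Illustrative sketch (assumes AVX2; not part of the patch, helper name is mine):
// the partial-load pattern used by loadu(ptr, count) above when count < size().
// A stack buffer is zeroed first (with a loop rather than "={0}", per the comment
// above), only `count` elements are copied in, and the full vector is then loaded,
// so lanes past `count` hold a deterministic 0 instead of uninitialized memory.
#include <cstring>
#include <immintrin.h>

inline __m256 load_partial_ps(const float* src, int count /* 0..8 */) {
  alignas(32) float tmp[8];
  for (int i = 0; i < 8; ++i) {
    tmp[i] = 0.0f;
  }
  std::memcpy(tmp, src, static_cast<size_t>(count) * sizeof(float));
  return _mm256_loadu_ps(tmp);
}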
+ for (const auto i : c10::irange(2 * size())) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) tmp_values[i] = 0.0; } std::memcpy( @@ -136,14 +276,25 @@ template <> class Vectorized> { if (count == size()) { _mm256_storeu_ps(reinterpret_cast(ptr), values); } else if (count > 0) { +<<<<<<< HEAD float tmp_values[2*size()]; +======= + float tmp_values[2 * size()]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm256_storeu_ps(reinterpret_cast(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(c10::complex)); } } +<<<<<<< HEAD const c10::complex& operator[](int idx) const = delete; c10::complex& operator[](int idx) = delete; Vectorized> map(c10::complex (*const f)(const c10::complex &)) const { +======= + const c10::complex& operator[](int idx) const = delete; + c10::complex& operator[](int idx) = delete; + Vectorized> map( + c10::complex (*const f)(const c10::complex&)) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ c10::complex tmp[size()]; store(tmp); for (const auto i : c10::irange(size())) { @@ -152,6 +303,7 @@ template <> class Vectorized> { return loadu(tmp); } __m256 abs_2_() const { +<<<<<<< HEAD auto val_2 = _mm256_mul_ps(values, values); // a*a b*b auto ret = _mm256_hadd_ps(val_2, val_2); // a*a+b*b a*a+b*b return _mm256_permute_ps(ret, 0xD8); @@ -176,6 +328,46 @@ template <> class Vectorized> { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)); auto angle = _mm256_permute_ps(angle_(), 0xB1); // angle 90-angle return _mm256_and_ps(angle, real_mask); // angle 0 +======= + auto val_2 = _mm256_mul_ps(values, values); // a*a b*b + auto ret = _mm256_hadd_ps(val_2, val_2); // a*a+b*b a*a+b*b + return _mm256_permute_ps(ret, 0xD8); + } + __m256 abs_() const { + auto real = _mm256_moveldup_ps(values); // real real + auto imag = _mm256_movehdup_ps(values); // imag imag + return Sleef_hypotf8_u05(real, imag); // abs abs + } + Vectorized> abs() const { + const __m256 real_mask = _mm256_castsi256_ps(_mm256_setr_epi32( + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000)); + return _mm256_and_ps(abs_(), real_mask); // abs 0 + } + __m256 angle_() const { + // angle = atan2(b/a) + auto b_a = _mm256_permute_ps(values, 0xB1); // b a + return Sleef_atan2f8_u10(values, b_a); // 90-angle angle + } + Vectorized> angle() const { + const __m256 real_mask = _mm256_castsi256_ps(_mm256_setr_epi32( + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000)); + auto angle = _mm256_permute_ps(angle_(), 0xB1); // angle 90-angle + return _mm256_and_ps(angle, real_mask); // angle 0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized> sgn() const { auto abs = abs_(); @@ -185,14 +377,27 @@ template <> class Vectorized> { return _mm256_blendv_ps(div, zero, mask); } __m256 real_() const { +<<<<<<< HEAD const __m256 real_mask = _mm256_castsi256_ps(_mm256_setr_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)); +======= + const __m256 real_mask = _mm256_castsi256_ps(_mm256_setr_epi32( + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000)); 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_and_ps(values, real_mask); } Vectorized> real() const { return real_(); } __m256 imag_() const { +<<<<<<< HEAD const __m256 imag_mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF)); return _mm256_and_ps(values, imag_mask); @@ -203,12 +408,37 @@ template <> class Vectorized> { __m256 conj_() const { const __m256 sign_mask = _mm256_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0); return _mm256_xor_ps(values, sign_mask); // a -b +======= + const __m256 imag_mask = _mm256_castsi256_ps(_mm256_setr_epi32( + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF)); + return _mm256_and_ps(values, imag_mask); + } + Vectorized> imag() const { + return _mm256_permute_ps(imag_(), 0xB1); // b a + } + __m256 conj_() const { + const __m256 sign_mask = + _mm256_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0); + return _mm256_xor_ps(values, sign_mask); // a -b +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized> conj() const { return conj_(); } Vectorized> log() const { +<<<<<<< HEAD // Most trigonomic ops use the log() op to improve complex number performance. +======= + // Most trigonomic ops use the log() op to improve complex number + // performance. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(std::log); } Vectorized> log2() const { @@ -223,7 +453,12 @@ template <> class Vectorized> { return map(std::log1p); } Vectorized> asin() const { +<<<<<<< HEAD // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. +======= + // TODO: The vectorized implementation requires special handling for the + // case where real number/imag number is 0/Inf/NaN. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // // asin(x) // // = -i*ln(iz + sqrt(1 -z^2)) // // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi))) @@ -231,6 +466,7 @@ template <> class Vectorized> { // const __m256 one = _mm256_set1_ps(1); // auto conj = conj_(); +<<<<<<< HEAD // auto b_a = _mm256_permute_ps(conj, 0xB1); //-b a // auto ab = _mm256_mul_ps(conj, b_a); //-ab -ab // auto im = _mm256_add_ps(ab, ab); //-2ab -2ab @@ -243,6 +479,21 @@ template <> class Vectorized> { // auto root = Vectorized(_mm256_blend_ps(re, im, 0xAA)).sqrt(); //sqrt(re + i*im) // auto ln = Vectorized(_mm256_add_ps(b_a, root)).log(); //ln(iz + sqrt()) // return Vectorized(_mm256_permute_ps(ln.values, 0xB1)).conj(); //-i*ln() +======= + // auto b_a = _mm256_permute_ps(conj, 0xB1); //-b a + // auto ab = _mm256_mul_ps(conj, b_a); //-ab + // -ab auto im = _mm256_add_ps(ab, ab); //-2ab -2ab + + // auto val_2 = _mm256_mul_ps(values, values); // a*a + // b*b auto re = _mm256_hsub_ps(val_2, _mm256_permute_ps(val_2, 0xB1)); // + // a*a-b*b b*b-a*a re = _mm256_permute_ps(re, 0xD8); re = + // _mm256_sub_ps(one, re); + + // auto root = Vectorized(_mm256_blend_ps(re, im, 0xAA)).sqrt(); //sqrt(re + + // i*im) auto ln = Vectorized(_mm256_add_ps(b_a, root)).log(); //ln(iz + + // sqrt()) return Vectorized(_mm256_permute_ps(ln.values, 0xB1)).conj(); + // //-i*ln() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(std::asin); } Vectorized> acos() const { @@ -253,6 +504,7 @@ template <> class Vectorized> { return map(std::atanh); } Vectorized> exp() const { +<<<<<<< HEAD // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. // //exp(a + bi) // // = exp(a)*(cos(b) + sin(b)i) @@ -262,6 +514,20 @@ template <> class Vectorized> { // auto sin_cos = Sleef_sincosf8_u10(values); //[sin(a), cos(a)] [sin(b), cos(b)] // auto cos_sin = _mm256_blend_ps(_mm256_permute_ps(sin_cos.y, 0xB1), // sin_cos.x, 0xAA); //cos(b) sin(b) +======= + // TODO: The vectorized implementation requires special handling for the + // case where real number/imag number is 0/Inf/NaN. 
+ // //exp(a + bi) + // // = exp(a)*(cos(b) + sin(b)i) + // auto exp = Sleef_expf8_u10(values); //exp(a) exp(b) exp = + // _mm256_blend_ps(exp, _mm256_permute_ps(exp, 0xB1), 0xAA); //exp(a) + // exp(a) + + // auto sin_cos = Sleef_sincosf8_u10(values); //[sin(a), cos(a)] [sin(b), + // cos(b)] auto cos_sin = _mm256_blend_ps(_mm256_permute_ps(sin_cos.y, + // 0xB1), + // sin_cos.x, 0xAA); //cos(b) sin(b) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // return _mm256_mul_ps(exp, cos_sin); return map(std::exp); } @@ -297,7 +563,12 @@ template <> class Vectorized> { return _mm256_sub_ps(zero, values); } Vectorized> round() const { +<<<<<<< HEAD return _mm256_round_ps(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +======= + return _mm256_round_ps( + values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized> tan() const { return map(std::tan); @@ -315,7 +586,12 @@ template <> class Vectorized> { Vectorized> rsqrt() const { return sqrt().reciprocal(); } +<<<<<<< HEAD Vectorized> pow(const Vectorized> &exp) const { +======= + Vectorized> pow( + const Vectorized>& exp) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ c10::complex x_tmp[size()]; __at_align__ c10::complex y_tmp[size()]; store(x_tmp); @@ -328,6 +604,7 @@ template <> class Vectorized> { // Comparison using the _CMP_**_OQ predicate. // `O`: get false if an operand is NaN // `Q`: do not raise if an operand is NaN +<<<<<<< HEAD Vectorized> operator==(const Vectorized>& other) const { return _mm256_cmp_ps(values, other.values, _CMP_EQ_OQ); } @@ -369,34 +646,123 @@ template <> Vectorized> inline operator*(const Vectorized> operator==( + const Vectorized>& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_EQ_OQ); + } + Vectorized> operator!=( + const Vectorized>& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_NEQ_UQ); + } + Vectorized> operator<( + const Vectorized>& /*other*/) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator<=( + const Vectorized>& /*other*/) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator>( + const Vectorized>& /*other*/) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator>=( + const Vectorized>& /*other*/) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vectorized> eq( + const Vectorized>& other) const; + Vectorized> ne( + const Vectorized>& other) const; +}; + +template <> +Vectorized> inline operator+( + const Vectorized>& a, + const Vectorized>& b) { + return _mm256_add_ps(a, b); +} + +template <> +Vectorized> inline operator-( + const Vectorized>& a, + const Vectorized>& b) { + return _mm256_sub_ps(a, b); +} + +template <> +Vectorized> inline operator*( + const Vectorized>& a, + const Vectorized>& b) { + //(a + bi) * (c + di) = (ac - bd) + (ad + bc)i + const __m256 sign_mask = + _mm256_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0); + auto ac_bd = _mm256_mul_ps(a, b); // ac bd + + auto d_c = _mm256_permute_ps(b, 0xB1); // d c + d_c = _mm256_xor_ps(sign_mask, d_c); // d -c + auto ad_bc = _mm256_mul_ps(a, d_c); // ad -bc + + auto ret = _mm256_hsub_ps(ac_bd, ad_bc); // ac - bd ad + bc +>>>>>>> 5729657180 
([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ret = _mm256_permute_ps(ret, 0xD8); return ret; } +<<<<<<< HEAD template <> Vectorized> inline operator/(const Vectorized> &a, const Vectorized> &b) { // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. +======= +template <> +Vectorized> inline operator/( + const Vectorized>& a, + const Vectorized>& b) { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // //re + im*i = (a + bi) / (c + di) // auto mask = _mm256_set1_ps(-0.f); // auto fabs_cd = _mm256_andnot_ps(mask, b); // |c| |d| // auto fabs_dc = _mm256_permute_ps(fabs_cd, 0xB1); // |d| |c| +<<<<<<< HEAD // auto scale = _mm256_rcp_ps(_mm256_max_ps(fabs_cd, fabs_dc)); // 1/sc 1/sc +======= + // auto scale = _mm256_rcp_ps(_mm256_max_ps(fabs_cd, fabs_dc)); // 1/sc 1/sc +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // auto a2 = _mm256_mul_ps(a, scale); // a/sc b/sc // auto b2 = _mm256_mul_ps(b, scale); // c/sc d/sc // auto acbd2 = _mm256_mul_ps(a2, b2); +<<<<<<< HEAD // const __m256 sign_mask = _mm256_setr_ps(-0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0); // auto dc2 = _mm256_permute_ps(b2, 0xB1); // d/sc c/sc +======= + // const __m256 sign_mask = _mm256_setr_ps(-0.0, 0.0, -0.0, 0.0, -0.0, 0.0, + // -0.0, 0.0); auto dc2 = _mm256_permute_ps(b2, 0xB1); // d/sc c/sc +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // dc2 = _mm256_xor_ps(sign_mask, dc2); // -d/|c,d| c/sc // auto adbc2 = _mm256_mul_ps(a2, dc2); //-ad/sc^2 bc/sc^2 // auto res2 = _mm256_hadd_ps(acbd2, adbc2); //(ac+bd)/sc^2 (bc-ad)/sc^2 // res2 = _mm256_permute_ps(res2, 0xD8); // // get the denominator +<<<<<<< HEAD // auto denom2 = Vectorized>(b2).abs_2_(); // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 // res2 = _mm256_div_ps(res2, denom2); // return res2; __at_align__ c10::complex tmp1[Vectorized>::size()]; __at_align__ c10::complex tmp2[Vectorized>::size()]; +======= + // auto denom2 = Vectorized>(b2).abs_2_(); // + // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 res2 = _mm256_div_ps(res2, denom2); return + // res2; + __at_align__ c10::complex + tmp1[Vectorized>::size()]; + __at_align__ c10::complex + tmp2[Vectorized>::size()]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ c10::complex out[Vectorized>::size()]; a.store(tmp1); b.store(tmp2); @@ -407,6 +773,7 @@ template <> Vectorized> inline operator/(const Vectorized> Vectorized>::reciprocal() const { // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. // //re + im*i = (a + bi) / (c + di) @@ -414,6 +781,17 @@ inline Vectorized> Vectorized>::reciproc // //im = (bc - ad)/abs_2() = d/abs_2() // const __m256 sign_mask = _mm256_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0); // auto c_d = _mm256_xor_ps(sign_mask, values); //c -d +======= +inline Vectorized> Vectorized< + c10::complex>::reciprocal() const { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. 
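// --- Illustrative sketch (not part of the patch) ---------------------------
// The commented-out vectorized path in operator/ above divides (a + bi) by
// (c + di) after pre-scaling both operands by 1/max(|c|, |d|), so the
// squared-magnitude denominator does not overflow. A minimal scalar sketch of
// that scaling scheme, ignoring the 0/Inf/NaN special cases the TODO mentions
// (the patch itself falls back to std::complex division for full coverage);
// the function name below is illustrative only.
#include <algorithm>
#include <cmath>
#include <complex>

static std::complex<float> scaled_complex_div(std::complex<float> x,
                                              std::complex<float> y) {
  const float a = x.real(), b = x.imag();
  const float c = y.real(), d = y.imag();
  const float sc = std::max(std::fabs(c), std::fabs(d));  // scale factor
  const float a2 = a / sc, b2 = b / sc;                   // a/sc  b/sc
  const float c2 = c / sc, d2 = d / sc;                   // c/sc  d/sc
  const float denom = c2 * c2 + d2 * d2;                  // (c^2 + d^2)/sc^2
  const float re = (a2 * c2 + b2 * d2) / denom;           // (ac + bd)/(c^2 + d^2)
  const float im = (b2 * c2 - a2 * d2) / denom;           // (bc - ad)/(c^2 + d^2)
  return {re, im};
}
// ----------------------------------------------------------------------------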
+ // //re + im*i = (a + bi) / (c + di) + // //re = (ac + bd)/abs_2() = c/abs_2() + // //im = (bc - ad)/abs_2() = d/abs_2() + // const __m256 sign_mask = _mm256_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, + // 0.0, -0.0); auto c_d = _mm256_xor_ps(sign_mask, values); //c -d +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // return _mm256_div_ps(c_d, abs_2_()); __at_align__ c10::complex tmp[size()]; store(tmp); @@ -423,6 +801,7 @@ inline Vectorized> Vectorized>::reciproc return loadu(tmp); } +<<<<<<< HEAD inline Vectorized> Vectorized>::atan() const { // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. // // atan(x) = i/2 * ln((i + z)/(i - z)) @@ -433,11 +812,32 @@ inline Vectorized> Vectorized>::atan() c // auto sub = Vectorized(_mm256_sub_ps(i, values)); // -a 1-b // auto ln = (sum/sub).log(); // ln((i + z)/(i - z)) // return i_half*ln; // i/2*ln() +======= +inline Vectorized> Vectorized>::atan() + const { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. + // // atan(x) = i/2 * ln((i + z)/(i - z)) + // const __m256 i = _mm256_setr_ps(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0); + // const Vectorized i_half = _mm256_setr_ps(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, + // 0.5); + + // auto sum = Vectorized(_mm256_add_ps(i, values)); // a + // 1+b auto sub = Vectorized(_mm256_sub_ps(i, values)); // -a 1-b auto + // ln = (sum/sub).log(); // ln((i + + // z)/(i - z)) return i_half*ln; // i/2*ln() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(std::atan); } template <> +<<<<<<< HEAD Vectorized> inline maximum(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline maximum( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto abs_a = a.abs_2_(); auto abs_b = b.abs_2_(); auto mask = _mm256_cmp_ps(abs_a, abs_b, _CMP_LT_OQ); @@ -448,7 +848,13 @@ Vectorized> inline maximum(const Vectorized +<<<<<<< HEAD Vectorized> inline minimum(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline minimum( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto abs_a = a.abs_2_(); auto abs_b = b.abs_2_(); auto mask = _mm256_cmp_ps(abs_a, abs_b, _CMP_GT_OQ); @@ -459,34 +865,73 @@ Vectorized> inline minimum(const Vectorized +<<<<<<< HEAD Vectorized> inline operator&(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline operator&( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_and_ps(a, b); } template <> +<<<<<<< HEAD Vectorized> inline operator|(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline operator|( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_or_ps(a, b); } template <> +<<<<<<< HEAD Vectorized> inline operator^(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline 
operator^( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_xor_ps(a, b); } inline Vectorized> Vectorized>::eq( const Vectorized>& other) const { +<<<<<<< HEAD auto eq = (*this == other); // compares real and imag individually // If both real numbers and imag numbers are equal, then the complex numbers are equal return (eq.real() & eq.imag()) & Vectorized>(_mm256_set1_ps(1.0f)); +======= + auto eq = (*this == other); // compares real and imag individually + // If both real numbers and imag numbers are equal, then the complex numbers + // are equal + return (eq.real() & eq.imag()) & + Vectorized>(_mm256_set1_ps(1.0f)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline Vectorized> Vectorized>::ne( const Vectorized>& other) const { +<<<<<<< HEAD auto ne = (*this != other); // compares real and imag individually // If either real numbers or imag numbers are not equal, then the complex numbers are not equal return (ne.real() | ne.imag()) & Vectorized>(_mm256_set1_ps(1.0f)); +======= + auto ne = (*this != other); // compares real and imag individually + // If either real numbers or imag numbers are not equal, then the complex + // numbers are not equal + return (ne.real() | ne.imag()) & + Vectorized>(_mm256_set1_ps(1.0f)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif +<<<<<<< HEAD }} // namespace at::vec::CPU_CAPABILITY +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_convert.h b/aten/src/ATen/cpu/vec/vec256/vec256_convert.h index 9dbdb4f3dfb2..9517b11d7a89 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_convert.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_convert.h @@ -1,6 +1,10 @@ #pragma once +<<<<<<< HEAD #include +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -117,6 +121,7 @@ struct VecConvert { src.store(buffer); at::vec::VectorizedN result; result[0] = Vectorized( +<<<<<<< HEAD static_cast(buffer[0]), static_cast(buffer[1]), static_cast(buffer[2]), @@ -126,6 +131,17 @@ struct VecConvert { static_cast(buffer[5]), static_cast(buffer[6]), static_cast(buffer[7])); +======= + static_cast(buffer[0]), + static_cast(buffer[1]), + static_cast(buffer[2]), + static_cast(buffer[3])); + result[1] = Vectorized( + static_cast(buffer[4]), + static_cast(buffer[5]), + static_cast(buffer[6]), + static_cast(buffer[7])); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return result; } }; @@ -171,12 +187,19 @@ struct VecConvert { } }; +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template <> struct VecConvert { static inline VectorizedN apply( const VectorizedN& src) { +<<<<<<< HEAD return Vectorized(_mm256_cvttps_epi32(src[0])); +======= + return Vectorized(_mm256_cvttps_epi32(src[0])); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) } }; @@ -184,7 +207,11 @@ template <> struct VecConvert { static inline VectorizedN apply( const VectorizedN& src) { +<<<<<<< HEAD return Vectorized(_mm256_cvtepi32_ps(src[0])); +======= + return Vectorized(_mm256_cvtepi32_ps(src[0])); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } }; @@ -219,15 +246,24 @@ struct VecConvert< 1, float, 2, +<<<<<<< HEAD typename std::enable_if_t, void>> { +======= + typename std::enable_if_t, void>> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline VectorizedN apply(const VectorizedN& src) { at::vec::Vectorized vec1 = convert_float_to_int8(src[0]); at::vec::Vectorized vec2 = convert_float_to_int8(src[1]); __m128 lane2 = _mm256_castps256_ps128(_mm256_castsi256_ps(vec2)); __m256 combined = _mm256_insertf128_ps(_mm256_castsi256_ps(vec1), lane2, 1); // Shuffle [191:128] bit from combined in to [127:64] bit of result +<<<<<<< HEAD __m256i result = _mm256_permute4x64_epi64(_mm256_castps_si256(combined), 0b11011000); +======= + __m256i result = + _mm256_permute4x64_epi64(_mm256_castps_si256(combined), 0b11011000); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::vec::Vectorized(result); } }; @@ -238,8 +274,12 @@ struct VecConvert< 1, float, 1, +<<<<<<< HEAD typename std::enable_if_t, void>> { +======= + typename std::enable_if_t, void>> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline VectorizedN apply(const VectorizedN& src) { return convert_float_to_int8(src[0]); } @@ -251,6 +291,7 @@ struct VecConvert< 2, src_t, 1, +<<<<<<< HEAD typename std::enable_if_t, void>> { static inline VectorizedN apply(const VectorizedN& src) { @@ -262,6 +303,19 @@ struct VecConvert< ) ); return VectorizedN(convert_int8_to_float(src[0]), convert_int8_to_float(src2)); +======= + typename std::enable_if_t, void>> { + static inline VectorizedN apply(const VectorizedN& src) { + // Shuffle [127:64] bit from src[0] in to [191:128] bit of shuffled + __m256i shuffled = _mm256_permute4x64_epi64(src[0], 0b11011000); + __m256i src2 = + _mm256_castsi128_si256(_mm_castps_si128(_mm256_extractf128_ps( + _mm256_castsi256_ps(shuffled), 1) // Extract the second 128-bit lane + )); + return VectorizedN( + convert_int8_to_float(src[0]), + convert_int8_to_float(src2)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } }; @@ -272,8 +326,12 @@ struct VecConvert< int64_t, 2, std::enable_if_t< +<<<<<<< HEAD std::is_same_v || std::is_same_v>> { +======= + std::is_same_v || std::is_same_v>> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline VectorizedN apply( const VectorizedN& src) { return VecConvert::apply( @@ -283,7 +341,10 @@ struct VecConvert< #endif /* defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) */ +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)) template struct VecConvert< @@ -291,14 +352,63 @@ struct VecConvert< 1, src_t, 1, +<<<<<<< HEAD typename std::enable_if_t, void>> { +======= + typename std::enable_if_t, 
void>> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline VectorizedN apply(const VectorizedN& src) { return convert_int8_to_float(src[0]); } }; #endif +<<<<<<< HEAD +======= +#if defined(CPU_CAPABILITY_SVE256) && defined(__ARM_FEATURE_BF16) + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + VectorizedN res; + // Load 16-bit unsigned integers from src into an SVE vector + svuint16_t u16x4 = + svld1_u16(svptrue_b16(), reinterpret_cast(&src[0])); + // Zero-extend to 32-bit SVE does not have direct vmovl_u16 equivalent. + vls_uint32_t u32x4 = + svreinterpret_u32_u16(svzip1_u16(svdup_n_u16(0), u16x4)); + // Reinterpret as float32 + vls_float32_t f32x4 = svreinterpret_f32_u32(u32x4); + res[0] = Vectorized(f32x4); + return res; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + VectorizedN res; + std::tie(res[0], res[1]) = convert_bfloat16_float(src[0]); + return res; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + VectorizedN res; + res[0] = convert_float_bfloat16(src[0], src[1]); + return res; + } +}; + +#endif // defined(CPU_CAPABILITY_SVE256) && defined(__ARM_FEATURE_BF16) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template struct VecConvert< float, diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_double.h index b4b878859cbb..e38c983c017c 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_double.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_double.h @@ -15,6 +15,7 @@ namespace at::vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { +<<<<<<< HEAD #if defined(CPU_CAPABILITY_AVX2) @@ -22,6 +23,19 @@ template <> class Vectorized { private: __m256d values; public: +======= +#if defined(CPU_CAPABILITY_AVX2) + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + __m256d values; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = double; using size_type = int; static constexpr size_type size() { @@ -39,6 +53,7 @@ template <> class Vectorized { return values; } template +<<<<<<< HEAD static Vectorized blend(const Vectorized& a, const Vectorized& b) { return _mm256_blend_pd(a.values, b.values, mask); } @@ -52,6 +67,30 @@ template <> class Vectorized { } static Vectorized set(const Vectorized& a, const Vectorized& b, int64_t count = size()) { +======= + static Vectorized blend( + const Vectorized& a, + const Vectorized& b) { + return _mm256_blend_pd(a.values, b.values, mask); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + return _mm256_blendv_pd(a.values, b.values, mask.values); + } + template + static Vectorized arange( + double base = 0., + step_t step = static_cast(1)) { + return Vectorized( + base, base + step, base + 2 * step, base + 3 * step); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -68,11 +107,19 @@ template <> class Vectorized { if 
(count == size()) return _mm256_loadu_pd(reinterpret_cast(ptr)); +<<<<<<< HEAD __at_align__ double tmp_values[size()]; // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. +======= + __at_align__ double tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(size())) { tmp_values[i] = 0.0; } @@ -91,10 +138,18 @@ template <> class Vectorized { std::memcpy(ptr, tmp_values, count * sizeof(double)); } } +<<<<<<< HEAD const double& operator[](int idx) const = delete; double& operator[](int idx) = delete; int zero_mask() const { // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit +======= + const double& operator[](int idx) const = delete; + double& operator[](int idx) = delete; + int zero_mask() const { + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated to 0-bit +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256d cmp = _mm256_cmp_pd(values, _mm256_set1_pd(0.0), _CMP_EQ_OQ); return _mm256_movemask_pd(cmp); } @@ -102,8 +157,14 @@ template <> class Vectorized { return _mm256_cmp_pd(values, _mm256_set1_pd(0.0), _CMP_UNORD_Q); } bool has_inf_nan() const { +<<<<<<< HEAD __m256d self_sub = _mm256_sub_pd(values, values); return (_mm256_movemask_epi8(_mm256_castpd_si256(self_sub)) & 0x77777777) != 0; +======= + __m256d self_sub = _mm256_sub_pd(values, values); + return (_mm256_movemask_epi8(_mm256_castpd_si256(self_sub)) & 0x77777777) != + 0; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized map(double (*const f)(double)) const { __at_align__ double tmp[size()]; @@ -156,10 +217,17 @@ template <> class Vectorized { Vectorized atanh() const { return Vectorized(Sleef_atanhd4_u10(values)); } +<<<<<<< HEAD Vectorized atan2(const Vectorized &b) const { return Vectorized(Sleef_atan2d4_u10(values, b)); } Vectorized copysign(const Vectorized &sign) const { +======= + Vectorized atan2(const Vectorized& b) const { + return Vectorized(Sleef_atan2d4_u10(values, b)); + } + Vectorized copysign(const Vectorized& sign) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(Sleef_copysignd4(values, sign)); } Vectorized erf() const { @@ -186,7 +254,11 @@ template <> class Vectorized { Vectorized fmod(const Vectorized& q) const { return Vectorized(Sleef_fmodd4(values, q)); } +<<<<<<< HEAD Vectorized hypot(const Vectorized &b) const { +======= + Vectorized hypot(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(Sleef_hypotd4_u05(values, b)); } Vectorized i0() const { @@ -198,7 
+270,11 @@ template <> class Vectorized { Vectorized digamma() const { return map(calc_digamma); } +<<<<<<< HEAD Vectorized igamma(const Vectorized &x) const { +======= + Vectorized igamma(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ double tmp[size()]; __at_align__ double tmp_x[size()]; store(tmp); @@ -208,7 +284,11 @@ template <> class Vectorized { } return loadu(tmp); } +<<<<<<< HEAD Vectorized igammac(const Vectorized &x) const { +======= + Vectorized igammac(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ double tmp[size()]; __at_align__ double tmp_x[size()]; store(tmp); @@ -252,11 +332,20 @@ template <> class Vectorized { Vectorized neg() const { return _mm256_xor_pd(_mm256_set1_pd(-0.), values); } +<<<<<<< HEAD Vectorized nextafter(const Vectorized &b) const { return Vectorized(Sleef_nextafterd4(values, b)); } Vectorized round() const { return _mm256_round_pd(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +======= + Vectorized nextafter(const Vectorized& b) const { + return Vectorized(Sleef_nextafterd4(values, b)); + } + Vectorized round() const { + return _mm256_round_pd( + values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized tan() const { return Vectorized(Sleef_tand4_u10(values)); @@ -279,7 +368,11 @@ template <> class Vectorized { Vectorized rsqrt() const { return _mm256_div_pd(_mm256_set1_pd(1), _mm256_sqrt_pd(values)); } +<<<<<<< HEAD Vectorized pow(const Vectorized &b) const { +======= + Vectorized pow(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(Sleef_powd4_u10(values, b)); } // Comparison using the _CMP_**_OQ predicate. 
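// --- Illustrative sketch (not part of the patch) ---------------------------
// The comparisons above use the "ordered, quiet" (_CMP_*_OQ) and
// "unordered, quiet" (_CMP_NEQ_UQ) predicates: an ordered compare yields
// false whenever either operand is NaN, an unordered one yields true, and the
// quiet variants never raise floating-point exceptions. A minimal standalone
// demonstration, assuming an AVX2-capable build (e.g. -mavx2):
#include <immintrin.h>
#include <cmath>
#include <cstdio>

int main() {
  const __m256d a = _mm256_setr_pd(1.0, std::nan(""), 3.0, 4.0);
  const __m256d b = _mm256_setr_pd(1.0, std::nan(""), 3.0, 5.0);
  // _CMP_EQ_OQ: NaN lanes compare false (bit cleared).
  const int eq = _mm256_movemask_pd(_mm256_cmp_pd(a, b, _CMP_EQ_OQ));
  // _CMP_NEQ_UQ: NaN lanes compare true (bit set).
  const int ne = _mm256_movemask_pd(_mm256_cmp_pd(a, b, _CMP_NEQ_UQ));
  std::printf("eq mask = 0x%x, ne mask = 0x%x\n", eq, ne);  // 0x5 and 0xa
  return 0;
}
// ----------------------------------------------------------------------------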
@@ -318,22 +411,46 @@ template <> class Vectorized { }; template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_add_pd(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_sub_pd(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_mul_pd(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_div_pd(a, b); } @@ -345,7 +462,13 @@ inline Vectorized Vectorized::frac() const { // Implements the IEEE 754 201X `maximum` operation, which propagates NaN if // either input is a NaN. template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized max = _mm256_max_pd(a, b); Vectorized isnan = _mm256_cmp_pd(a, b, _CMP_UNORD_Q); // Exploit the fact that all-ones is a NaN. @@ -355,7 +478,13 @@ Vectorized inline maximum(const Vectorized& a, const Vectorized< // Implements the IEEE 754 201X `minimum` operation, which propagates NaN if // either input is a NaN. template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized min = _mm256_min_pd(a, b); Vectorized isnan = _mm256_cmp_pd(a, b, _CMP_UNORD_Q); // Exploit the fact that all-ones is a NaN. 
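// --- Illustrative sketch (not part of the patch) ---------------------------
// maximum()/minimum() above implement the IEEE 754-201x operations that
// propagate NaN: _mm256_max_pd/_mm256_min_pd alone would drop NaNs, so the
// unordered-compare mask (all-ones wherever either input is NaN) is OR-ed
// into the result, and an all-ones double bit pattern is itself a NaN.
// A minimal standalone version of the same trick, assuming AVX2; the function
// name below is illustrative only.
#include <immintrin.h>
#include <cmath>
#include <cstdio>

static __m256d nan_propagating_max(__m256d a, __m256d b) {
  const __m256d max = _mm256_max_pd(a, b);
  const __m256d isnan = _mm256_cmp_pd(a, b, _CMP_UNORD_Q);  // all-ones if a or b is NaN
  return _mm256_or_pd(max, isnan);  // force NaN lanes to an all-ones (NaN) pattern
}

int main() {
  const __m256d a = _mm256_setr_pd(1.0, std::nan(""), -2.0, 7.0);
  const __m256d b = _mm256_setr_pd(3.0, 0.0, -5.0, std::nan(""));
  double out[4];
  _mm256_storeu_pd(out, nan_propagating_max(a, b));
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 3 nan -2 nan
  return 0;
}
// ----------------------------------------------------------------------------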
@@ -363,31 +492,63 @@ Vectorized inline minimum(const Vectorized& a, const Vectorized< } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_pd(max, _mm256_max_pd(min, a)); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_max_pd(min, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_pd(max, a); } template <> +<<<<<<< HEAD Vectorized inline operator&(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_and_pd(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator|(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_or_pd(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator^(const Vectorized& a, const Vectorized& b) { return _mm256_xor_pd(a, b); } @@ -413,6 +574,41 @@ inline Vectorized Vectorized::lt(const Vectorized& other } inline Vectorized Vectorized::le(const Vectorized& other) const { +======= +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return _mm256_xor_pd(a, b); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*this <= other) & Vectorized(1.0); } @@ -422,7 +618,12 @@ inline void convert(const double* src, double* dst, int64_t n) { #ifndef __msvc_cl__ #pragma unroll #endif +<<<<<<< HEAD for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { +======= + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm256_storeu_pd(dst 
+ i, _mm256_loadu_pd(src + i)); } #ifndef __msvc_cl__ @@ -435,16 +636,35 @@ inline void convert(const double* src, double* dst, int64_t n) { #ifdef CPU_CAPABILITY_AVX2 template <> +<<<<<<< HEAD Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_fmadd_pd(a, b, c); } template <> +<<<<<<< HEAD Vectorized inline fmsub(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +Vectorized inline fmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_fmsub_pd(a, b, c); } #endif #endif +<<<<<<< HEAD }} // namespace at::vec::CPU_CAPABILITY +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h index d57c28cfdbdc..4bb854314825 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h @@ -17,10 +17,22 @@ inline namespace CPU_CAPABILITY { #if defined(CPU_CAPABILITY_AVX2) +<<<<<<< HEAD template <> class Vectorized { private: __m256 values; public: +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + __m256 values; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = float; using size_type = int; static constexpr size_type size() { @@ -31,16 +43,41 @@ template <> class Vectorized { Vectorized(float val) { values = _mm256_set1_ps(val); } +<<<<<<< HEAD Vectorized(float val1, float val2, float val3, float val4, float val5, float val6, float val7, float val8) { values = _mm256_setr_ps(val1, val2, val3, val4, val5, val6, val7, val8); } Vectorized(const float (&arr)[8]) : Vectorized(arr[0], arr[1], arr[2], arr[3], arr[4], arr[5], arr[6], arr[7]) {} +======= + Vectorized( + float val1, + float val2, + float val3, + float val4, + float val5, + float val6, + float val7, + float val8) { + values = _mm256_setr_ps(val1, val2, val3, val4, val5, val6, val7, val8); + } + Vectorized(const float (&arr)[8]) + : Vectorized( + arr[0], + arr[1], + arr[2], + arr[3], + arr[4], + arr[5], + arr[6], + arr[7]) {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) operator __m256() const { return values; } template +<<<<<<< HEAD static Vectorized blend(const Vectorized& a, const Vectorized& b) { return _mm256_blend_ps(a.values, b.values, mask); } @@ -56,6 +93,37 @@ template <> class Vectorized { } static Vectorized set(const Vectorized& a, const Vectorized& b, int64_t count = size()) { +======= + static Vectorized blend( + const Vectorized& a, + const Vectorized& b) { + return _mm256_blend_ps(a.values, b.values, mask); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + return _mm256_blendv_ps(a.values, b.values, mask.values); + } + template + static Vectorized arange( + float base = 0.f, 
+ step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -80,9 +148,16 @@ template <> class Vectorized { if (count == size()) return _mm256_loadu_ps(reinterpret_cast(ptr)); __at_align__ float tmp_values[size()]; +<<<<<<< HEAD // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. +======= + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(size())) { tmp_values[i] = 0.0; } @@ -99,10 +174,18 @@ template <> class Vectorized { std::memcpy(ptr, tmp_values, count * sizeof(float)); } } +<<<<<<< HEAD const float& operator[](int idx) const = delete; float& operator[](int idx) = delete; int zero_mask() const { // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit +======= + const float& operator[](int idx) const = delete; + float& operator[](int idx) = delete; + int zero_mask() const { + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated to 0-bit +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 cmp = _mm256_cmp_ps(values, _mm256_set1_ps(0.0f), _CMP_EQ_OQ); return _mm256_movemask_ps(cmp); } @@ -111,8 +194,14 @@ template <> class Vectorized { } bool has_inf_nan() const { +<<<<<<< HEAD __m256 self_sub = _mm256_sub_ps(values, values); return (_mm256_movemask_epi8(_mm256_castps_si256(self_sub)) & 0x77777777) != 0; +======= + __m256 self_sub = _mm256_sub_ps(values, values); + return (_mm256_movemask_epi8(_mm256_castps_si256(self_sub)) & 0x77777777) != + 0; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized map(float (*const f)(float)) const { @@ -166,10 +255,17 @@ template <> class Vectorized { Vectorized atanh() const { return Vectorized(Sleef_atanhf8_u10(values)); } +<<<<<<< HEAD Vectorized atan2(const Vectorized &b) const { return Vectorized(Sleef_atan2f8_u10(values, b)); } Vectorized copysign(const Vectorized &sign) const { +======= + Vectorized atan2(const Vectorized& b) const { + return Vectorized(Sleef_atan2f8_u10(values, b)); + } + Vectorized copysign(const Vectorized& sign) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(Sleef_copysignf8(values, sign)); } Vectorized erf() const { @@ -237,9 +333,18 @@ template <> class Vectorized { const __m256 vec_one = _mm256_set1_ps(1.f); const __m256 
vec_zero = _mm256_set1_ps(0.f); const __m256 vec_two = _mm256_set1_ps(2.f); +<<<<<<< HEAD const __m256 vec_ln2f = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2) const __m256 vec_ln_flt_min = _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50)); const __m256 vec_ln_flt_max = _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218)); +======= + const __m256 vec_ln2f = + _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2) + const __m256 vec_ln_flt_min = + _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50)); + const __m256 vec_ln_flt_max = + _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const __m256i vec_127 = _mm256_set1_epi32(0x0000007f); const int n_mantissa_bits = 23; @@ -315,7 +420,11 @@ template <> class Vectorized { Vectorized floor() const { return _mm256_floor_ps(values); } +<<<<<<< HEAD Vectorized hypot(const Vectorized &b) const { +======= + Vectorized hypot(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(Sleef_hypotf8_u05(values, b)); } Vectorized i0() const { @@ -327,7 +436,11 @@ template <> class Vectorized { Vectorized digamma() const { return map(calc_digamma); } +<<<<<<< HEAD Vectorized igamma(const Vectorized &x) const { +======= + Vectorized igamma(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ float tmp[size()]; __at_align__ float tmp_x[size()]; store(tmp); @@ -337,7 +450,11 @@ template <> class Vectorized { } return loadu(tmp); } +<<<<<<< HEAD Vectorized igammac(const Vectorized &x) const { +======= + Vectorized igammac(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ float tmp[size()]; __at_align__ float tmp_x[size()]; store(tmp); @@ -350,11 +467,20 @@ template <> class Vectorized { Vectorized neg() const { return _mm256_xor_ps(_mm256_set1_ps(-0.f), values); } +<<<<<<< HEAD Vectorized nextafter(const Vectorized &b) const { return Vectorized(Sleef_nextafterf8(values, b)); } Vectorized round() const { return _mm256_round_ps(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +======= + Vectorized nextafter(const Vectorized& b) const { + return Vectorized(Sleef_nextafterf8(values, b)); + } + Vectorized round() const { + return _mm256_round_ps( + values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized tan() const { return Vectorized(Sleef_tanf8_u10(values)); @@ -377,7 +503,11 @@ template <> class Vectorized { Vectorized rsqrt() const { return _mm256_div_ps(_mm256_set1_ps(1), _mm256_sqrt_ps(values)); } +<<<<<<< HEAD Vectorized pow(const Vectorized &b) const { +======= + Vectorized pow(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(Sleef_powf8_u10(values, b)); } float reduce_add() const { @@ -442,22 +572,46 @@ template <> class Vectorized { }; template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + 
const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_add_ps(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_sub_ps(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_mul_ps(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_div_ps(a, b); } @@ -469,7 +623,13 @@ inline Vectorized Vectorized::frac() const { // Implements the IEEE 754 201X `maximum` operation, which propagates NaN if // either input is a NaN. template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized max = _mm256_max_ps(a, b); Vectorized isnan = _mm256_cmp_ps(a, b, _CMP_UNORD_Q); // Exploit the fact that all-ones is a NaN. @@ -479,7 +639,13 @@ Vectorized inline maximum(const Vectorized& a, const Vectorized +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized min = _mm256_min_ps(a, b); Vectorized isnan = _mm256_cmp_ps(a, b, _CMP_UNORD_Q); // Exploit the fact that all-ones is a NaN. 
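// --- Illustrative sketch (not part of the patch) ---------------------------
// The float exp() implementation a little further up in this specialization
// builds its constants from raw IEEE-754 bit patterns (integer constants
// reinterpreted as floats via _mm256_set1_epi32 + _mm256_castsi256_ps):
// 0x3f317218 is ln(2), 0xc2aeac50 is ln(FLT_MIN) and 0x42b17218 is
// ln(FLT_MAX). A small standalone check of those encodings, assuming C++20
// std::bit_cast is available:
#include <bit>
#include <cfloat>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const float ln2        = std::bit_cast<float>(std::uint32_t{0x3f317218});
  const float ln_flt_min = std::bit_cast<float>(std::uint32_t{0xc2aeac50});
  const float ln_flt_max = std::bit_cast<float>(std::uint32_t{0x42b17218});
  std::printf("%.7g vs %.7g\n", ln2, std::log(2.0f));           // ~0.6931472
  std::printf("%.7g vs %.7g\n", ln_flt_min, std::log(FLT_MIN)); // ~-87.33655
  std::printf("%.7g vs %.7g\n", ln_flt_max, std::log(FLT_MAX)); // ~88.72284
  return 0;
}
// ----------------------------------------------------------------------------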
@@ -487,31 +653,63 @@ Vectorized inline minimum(const Vectorized& a, const Vectorized +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_ps(max, _mm256_max_ps(min, a)); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_ps(max, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_max_ps(min, a); } template <> +<<<<<<< HEAD Vectorized inline operator&(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_and_ps(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator|(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_or_ps(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator^(const Vectorized& a, const Vectorized& b) { return _mm256_xor_ps(a, b); } @@ -537,6 +735,41 @@ inline Vectorized Vectorized::lt(const Vectorized& other) c } inline Vectorized Vectorized::le(const Vectorized& other) const { +======= +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return _mm256_xor_ps(a, b); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*this <= other) & Vectorized(1.0f); } @@ -546,7 +779,12 @@ inline void convert(const float* src, float* dst, int64_t n) { #ifndef __msvc_cl__ #pragma unroll #endif +<<<<<<< HEAD for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { +======= + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm256_storeu_ps(dst + i, 
_mm256_loadu_ps(src + i)); } #ifndef __msvc_cl__ @@ -557,20 +795,39 @@ inline void convert(const float* src, float* dst, int64_t n) { } } +<<<<<<< HEAD template <> Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +template <> +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_fmadd_ps(a, b, c); } template <> +<<<<<<< HEAD Vectorized inline fmsub(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +Vectorized inline fmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_fmsub_ps(a, b, c); } // TODO: rewrite with ATEN vectorized (need to add unpack and shuffle) // Used by Inductor CPP codegen for micro gemm +<<<<<<< HEAD inline void transpose_block(at::vec::VectorizedN &input) { +======= +inline void transpose_block(at::vec::VectorizedN& input) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 temp0[8]; // unpacking and interleaving 32-bit elements // a0 b0 a1 b1 a4 b4 a5 b5 @@ -600,6 +857,7 @@ inline void transpose_block(at::vec::VectorizedN &input) { // e1 f1 g1 h1 ... // e2 f2 g2 h2 ... // e3 f3 g3 h3 ... +<<<<<<< HEAD temp1[0] = _mm256_castpd_ps( _mm256_unpacklo_pd(_mm256_castps_pd(temp0[0]), _mm256_castps_pd(temp0[2]))); temp1[1] = _mm256_castpd_ps( @@ -616,6 +874,24 @@ inline void transpose_block(at::vec::VectorizedN &input) { _mm256_unpacklo_pd(_mm256_castps_pd(temp0[5]), _mm256_castps_pd(temp0[7]))); temp1[7] = _mm256_castpd_ps( _mm256_unpackhi_pd(_mm256_castps_pd(temp0[5]), _mm256_castps_pd(temp0[7]))); +======= + temp1[0] = _mm256_castpd_ps(_mm256_unpacklo_pd( + _mm256_castps_pd(temp0[0]), _mm256_castps_pd(temp0[2]))); + temp1[1] = _mm256_castpd_ps(_mm256_unpackhi_pd( + _mm256_castps_pd(temp0[0]), _mm256_castps_pd(temp0[2]))); + temp1[2] = _mm256_castpd_ps(_mm256_unpacklo_pd( + _mm256_castps_pd(temp0[1]), _mm256_castps_pd(temp0[3]))); + temp1[3] = _mm256_castpd_ps(_mm256_unpackhi_pd( + _mm256_castps_pd(temp0[1]), _mm256_castps_pd(temp0[3]))); + temp1[4] = _mm256_castpd_ps(_mm256_unpacklo_pd( + _mm256_castps_pd(temp0[4]), _mm256_castps_pd(temp0[6]))); + temp1[5] = _mm256_castpd_ps(_mm256_unpackhi_pd( + _mm256_castps_pd(temp0[4]), _mm256_castps_pd(temp0[6]))); + temp1[6] = _mm256_castpd_ps(_mm256_unpacklo_pd( + _mm256_castps_pd(temp0[5]), _mm256_castps_pd(temp0[7]))); + temp1[7] = _mm256_castpd_ps(_mm256_unpackhi_pd( + _mm256_castps_pd(temp0[5]), _mm256_castps_pd(temp0[7]))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // shuffle 128-bits (composed of 4 32-bit elements) // a0 b0 c0 d0 e0 f0 g0 h0 @@ -637,7 +913,11 @@ inline void transpose_block(at::vec::VectorizedN &input) { } // Used by Inductor CPP codegen +<<<<<<< HEAD template<> +======= +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline void transpose_mxn( const float* src, int64_t ld_src, @@ -672,12 +952,17 @@ inline void transpose_mxn( } } +<<<<<<< HEAD template<> +======= +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast 
kernel for mixed dtypes with float/bfloat16/half (#2791)) inline void transpose_mxn( const float* src, int64_t ld_src, float* dst, int64_t ld_dst) { +<<<<<<< HEAD transpose_mxn( src , ld_src, dst, ld_dst); transpose_mxn( @@ -690,3 +975,15 @@ inline void transpose_mxn( #endif }} // namespace at::vec::CPU_CAPABILITY +======= + transpose_mxn(src, ld_src, dst, ld_dst); + transpose_mxn(src + 8, ld_src, dst + 8 * ld_dst, ld_dst); + transpose_mxn(src + 8 * ld_src, ld_src, dst + 8, ld_dst); + transpose_mxn( + src + 8 * ld_src + 8, ld_src, dst + 8 * ld_dst + 8, ld_dst); +} +#endif + +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_half.h b/aten/src/ATen/cpu/vec/vec256/vec256_half.h index b27f33c84323..7d08c61e10c4 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_half.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_half.h @@ -13,8 +13,16 @@ inline namespace CPU_CAPABILITY { #ifdef CPU_CAPABILITY_AVX2 template <> +<<<<<<< HEAD class Vectorized: public Vectorized16 { public: +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorized16 { + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using Vectorized16::Vectorized16; using value_type = Half; @@ -29,6 +37,7 @@ class Vectorized: public Vectorized16 { Vectorized le(const Vectorized& other) const; }; +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { return _mm256_add_ps(x, y); }); } @@ -67,6 +76,74 @@ inline Vectorized Vectorized::lt(const Vectorized& other) cons return (*this < other) & Vectorized(1.0f); } inline Vectorized Vectorized::le(const Vectorized& other) const { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { + return _mm256_add_ps(x, y); + }); +} +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { + return _mm256_sub_ps(x, y); + }); +} +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { + return _mm256_mul_ps(x, y); + }); +} +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { + return _mm256_div_ps(x, y); + }); +} +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return _mm256_and_si256(a, b); +} +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return _mm256_or_si256(a, b); +} +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return _mm256_xor_si256(a, b); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0f); +} +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} +inline 
Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} +inline Vectorized Vectorized::le( + const Vectorized& other) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*this <= other) & Vectorized(1.0f); } @@ -78,7 +155,13 @@ inline Vectorized Vectorized::frac() const { // Implements the IEEE 754 201X `maximum` operation, which propagates NaN if // either input is a NaN. template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a_lo, a_hi; __m256 b_lo, b_hi; cvtfp16_fp32(__m256i(a), a_lo, a_hi); @@ -96,7 +179,13 @@ Vectorized inline maximum(const Vectorized& a, const Vectorized +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a_lo, a_hi; __m256 b_lo, b_hi; cvtfp16_fp32(__m256i(a), a_lo, a_hi); @@ -112,8 +201,15 @@ Vectorized inline minimum(const Vectorized& a, const Vectorized +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a_lo, a_hi; __m256 min_lo, min_hi; __m256 max_lo, max_hi; @@ -126,7 +222,13 @@ Vectorized inline clamp(const Vectorized& a, } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a_lo, a_hi; __m256 max_lo, max_hi; cvtfp16_fp32(__m256i(a), a_lo, a_hi); @@ -137,7 +239,13 @@ Vectorized inline clamp_max(const Vectorized& a, const Vectorized +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a_lo, a_hi; __m256 min_lo, min_hi; cvtfp16_fp32(__m256i(a), a_lo, a_hi); @@ -153,8 +261,15 @@ inline void convert(const Half* src, Half* dst, int64_t n) { #ifndef __msvc_cl__ #pragma unroll #endif +<<<<<<< HEAD for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { auto vsrc = _mm256_loadu_si256(reinterpret_cast<__m256i*>((void*)(src + i))); +======= + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + auto vsrc = + _mm256_loadu_si256(reinterpret_cast<__m256i*>((void*)(src + i))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm256_storeu_si256(reinterpret_cast<__m256i*>((void*)(dst + i)), vsrc); } #ifndef __msvc_cl__ @@ -168,7 +283,12 @@ inline void convert(const Half* src, Half* dst, int64_t n) { template 
<> inline void convert(const float* src, Half* dst, int64_t n) { int64_t i; +<<<<<<< HEAD for (i = 0; i + Vectorized::size() <= n; i += Vectorized::size()) { +======= + for (i = 0; i + Vectorized::size() <= n; + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a = _mm256_loadu_ps(&src[i]); __m256 b = _mm256_loadu_ps(&src[i + 8]); @@ -182,7 +302,11 @@ inline void convert(const float* src, Half* dst, int64_t n) { template <> inline void convert(const double* src, Half* dst, int64_t n) { +<<<<<<< HEAD auto load_float = [](const double *src) -> __m256 { +======= + auto load_float = [](const double* src) -> __m256 { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Load one float vector from an array of doubles __m128 a = _mm256_cvtpd_ps(_mm256_loadu_pd(src)); __m128 b = _mm256_cvtpd_ps(_mm256_loadu_pd(src + 4)); @@ -190,7 +314,12 @@ inline void convert(const double* src, Half* dst, int64_t n) { }; int64_t i; +<<<<<<< HEAD for (i = 0; i + Vectorized::size() <= n; i += Vectorized::size()) { +======= + for (i = 0; i + Vectorized::size() <= n; + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a = load_float(&src[i]); __m256 b = load_float(&src[i + 8]); @@ -203,8 +332,15 @@ inline void convert(const double* src, Half* dst, int64_t n) { } template <> +<<<<<<< HEAD Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a_lo, a_hi; __m256 b_lo, b_hi; __m256 c_lo, c_hi; @@ -221,10 +357,21 @@ LOAD_FP32_VECTORIZED_INIT(Half, fp16) #else // defined(CPU_CAPABILITY_AVX2) +<<<<<<< HEAD #if !(defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && !defined(CPU_CAPABILITY_SVE256)) +======= +#if !( \ + defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \ + !defined(CPU_CAPABILITY_SVE256)) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CONVERT_NON_VECTORIZED_INIT(Half, half) #endif LOAD_FP32_NON_VECTORIZED_INIT(Half, fp16) #endif // defined(CPU_CAPABILITY_AVX2) +<<<<<<< HEAD }} // namsepace at::vec::CPU_CAPABILITY +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_int.h b/aten/src/ATen/cpu/vec/vec256/vec256_int.h index 03929eecfed3..0d80d92f081b 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_int.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_int.h @@ -14,14 +14,23 @@ inline namespace CPU_CAPABILITY { #ifdef CPU_CAPABILITY_AVX2 struct Vectorizedi { +<<<<<<< HEAD protected: +======= + protected: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256i values; static inline __m256i invert(const __m256i& v) { const auto ones = _mm256_set1_epi64x(-1); return _mm256_xor_si256(ones, v); } +<<<<<<< HEAD public: +======= + + public: +>>>>>>> 5729657180 ([ROCm] Specialized 
binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorizedi() {} Vectorizedi(__m256i v) : values(v) {} operator __m256i() const { @@ -31,17 +40,32 @@ struct Vectorizedi { #else +<<<<<<< HEAD struct Vectorizedi {}; // dummy definition to make Vectorizedi always defined +======= +struct Vectorizedi {}; // dummy definition to make Vectorizedi always defined +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif // CPU_CAPABILITY_AVX2 #ifdef CPU_CAPABILITY_AVX2 template <> +<<<<<<< HEAD class Vectorized : public Vectorizedi { private: static const Vectorized ones; public: +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorizedi { + private: + static const Vectorized ones; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = int64_t; using size_type = int; static constexpr size_type size() { @@ -49,12 +73,24 @@ class Vectorized : public Vectorizedi { } using Vectorizedi::Vectorizedi; Vectorized() {} +<<<<<<< HEAD Vectorized(int64_t v) { values = _mm256_set1_epi64x(v); } +======= + Vectorized(int64_t v) { + values = _mm256_set1_epi64x(v); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(int64_t val1, int64_t val2, int64_t val3, int64_t val4) { values = _mm256_setr_epi64x(val1, val2, val3, val4); } template +<<<<<<< HEAD static Vectorized blend(Vectorized a, Vectorized b) { +======= + static Vectorized blend( + Vectorized a, + Vectorized b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ int64_t tmp_values[size()]; a.store(tmp_values); if (mask & 0x01) @@ -67,6 +103,7 @@ class Vectorized : public Vectorizedi { tmp_values[3] = _mm256_extract_epi64(b.values, 3); return loadu(tmp_values); } +<<<<<<< HEAD static Vectorized blendv(const Vectorized& a, const Vectorized& b, const Vectorized& mask) { return _mm256_blendv_epi8(a.values, b.values, mask.values); @@ -77,6 +114,25 @@ class Vectorized : public Vectorizedi { } static Vectorized set(Vectorized a, Vectorized b, int64_t count = size()) { +======= + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + return _mm256_blendv_epi8(a.values, b.values, mask.values); + } + template + static Vectorized arange( + int64_t base = 0, + step_t step = static_cast(1)) { + return Vectorized( + base, base + step, base + 2 * step, base + 3 * step); + } + static Vectorized set( + Vectorized a, + Vectorized b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -94,9 +150,16 @@ class Vectorized : public Vectorizedi { } static Vectorized loadu(const void* ptr, int64_t count) { __at_align__ int64_t tmp_values[size()]; +<<<<<<< HEAD // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. 
+======= + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } @@ -114,8 +177,13 @@ class Vectorized : public Vectorizedi { std::memcpy(ptr, tmp_values, count * sizeof(int64_t)); } } +<<<<<<< HEAD const int64_t& operator[](int idx) const = delete; int64_t& operator[](int idx) = delete; +======= + const int64_t& operator[](int idx) const = delete; + int64_t& operator[](int idx) = delete; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized abs() const { auto zero = _mm256_set1_epi64x(0); auto is_larger = _mm256_cmpgt_epi64(zero, values); @@ -160,16 +228,28 @@ class Vectorized : public Vectorizedi { }; template <> +<<<<<<< HEAD class Vectorized : public Vectorizedi { private: static const Vectorized ones; public: +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorizedi { + private: + static const Vectorized ones; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = int32_t; static constexpr int size() { return 8; } using Vectorizedi::Vectorizedi; Vectorized() {} +<<<<<<< HEAD Vectorized(int32_t v) { values = _mm256_set1_epi32(v); } Vectorized(int32_t val1, int32_t val2, int32_t val3, int32_t val4, int32_t val5, int32_t val6, int32_t val7, int32_t val8) { @@ -191,6 +271,52 @@ class Vectorized : public Vectorizedi { } static Vectorized set(Vectorized a, Vectorized b, int32_t count = size()) { +======= + Vectorized(int32_t v) { + values = _mm256_set1_epi32(v); + } + Vectorized( + int32_t val1, + int32_t val2, + int32_t val3, + int32_t val4, + int32_t val5, + int32_t val6, + int32_t val7, + int32_t val8) { + values = _mm256_setr_epi32(val1, val2, val3, val4, val5, val6, val7, val8); + } + template + static Vectorized blend( + Vectorized a, + Vectorized b) { + return _mm256_blend_epi32(a, b, mask); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + return _mm256_blendv_epi8(a.values, b.values, mask.values); + } + template + static Vectorized arange( + int32_t base = 0, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step); + } + static Vectorized set( + Vectorized a, + Vectorized b, + int32_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -216,9 +342,16 @@ class Vectorized : public Vectorizedi { } static Vectorized loadu(const void* ptr, int32_t count) { __at_align__ int32_t tmp_values[size()]; +<<<<<<< HEAD // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. 
+======= + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } @@ -236,8 +369,13 @@ class Vectorized : public Vectorizedi { std::memcpy(ptr, tmp_values, count * sizeof(int32_t)); } } +<<<<<<< HEAD const int32_t& operator[](int idx) const = delete; int32_t& operator[](int idx) = delete; +======= + const int32_t& operator[](int idx) const = delete; + int32_t& operator[](int idx) = delete; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized abs() const { return _mm256_abs_epi32(values); } @@ -306,6 +444,7 @@ class Vectorized : public Vectorizedi { }; template <> +<<<<<<< HEAD inline void convert(const int32_t *src, float *dst, int64_t n) { int64_t i; // int32_t and float have same size @@ -314,11 +453,27 @@ inline void convert(const int32_t *src, float *dst, int64_t n) { #endif for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { auto input_vec = _mm256_loadu_si256(reinterpret_cast(src + i)); +======= +inline void convert(const int32_t* src, float* dst, int64_t n) { + int64_t i; + // int32_t and float have same size +#ifndef _MSC_VER +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + auto input_vec = + _mm256_loadu_si256(reinterpret_cast(src + i)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto output_vec = _mm256_cvtepi32_ps(input_vec); _mm256_storeu_ps(reinterpret_cast(dst + i), output_vec); } #ifndef _MSC_VER +<<<<<<< HEAD # pragma unroll +======= +#pragma unroll +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif for (; i < n; i++) { dst[i] = static_cast(src[i]); @@ -326,6 +481,7 @@ inline void convert(const int32_t *src, float *dst, int64_t n) { } template <> +<<<<<<< HEAD inline void convert(const int32_t *src, double *dst, int64_t n) { int64_t i; // int32_t has half the size of double @@ -334,11 +490,27 @@ inline void convert(const int32_t *src, double *dst, int64_t n) { #endif for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { auto input_128_vec = _mm_loadu_si128(reinterpret_cast(src + i)); +======= +inline void convert(const int32_t* src, double* dst, int64_t n) { + int64_t i; + // int32_t has half the size of double +#ifndef _MSC_VER +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + auto input_128_vec = + _mm_loadu_si128(reinterpret_cast(src + i)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto output_vec = _mm256_cvtepi32_pd(input_128_vec); _mm256_storeu_pd(reinterpret_cast(dst + i), output_vec); } #ifndef _MSC_VER +<<<<<<< HEAD # pragma unroll +======= +#pragma unroll +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif for (; i < n; i++) { dst[i] = static_cast(src[i]); @@ -346,16 +518,28 @@ inline void 
convert(const int32_t *src, double *dst, int64_t n) { } template <> +<<<<<<< HEAD class Vectorized : public Vectorizedi { private: static const Vectorized ones; public: +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorizedi { + private: + static const Vectorized ones; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = int16_t; static constexpr int size() { return 16; } using Vectorizedi::Vectorizedi; Vectorized() {} +<<<<<<< HEAD Vectorized(int16_t v) { values = _mm256_set1_epi16(v); } Vectorized(int16_t val1, int16_t val2, int16_t val3, int16_t val4, int16_t val5, int16_t val6, int16_t val7, int16_t val8, @@ -366,6 +550,50 @@ class Vectorized : public Vectorizedi { } template static Vectorized blend(Vectorized a, Vectorized b) { +======= + Vectorized(int16_t v) { + values = _mm256_set1_epi16(v); + } + Vectorized( + int16_t val1, + int16_t val2, + int16_t val3, + int16_t val4, + int16_t val5, + int16_t val6, + int16_t val7, + int16_t val8, + int16_t val9, + int16_t val10, + int16_t val11, + int16_t val12, + int16_t val13, + int16_t val14, + int16_t val15, + int16_t val16) { + values = _mm256_setr_epi16( + val1, + val2, + val3, + val4, + val5, + val6, + val7, + val8, + val9, + val10, + val11, + val12, + val13, + val14, + val15, + val16); + } + template + static Vectorized blend( + Vectorized a, + Vectorized b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ int16_t tmp_values[size()]; a.store(tmp_values); if (mask & 0x01) @@ -402,6 +630,7 @@ class Vectorized : public Vectorizedi { tmp_values[15] = _mm256_extract_epi16(b.values, 15); return loadu(tmp_values); } +<<<<<<< HEAD static Vectorized blendv(const Vectorized& a, const Vectorized& b, const Vectorized& mask) { return _mm256_blendv_epi8(a.values, b.values, mask.values); @@ -416,6 +645,40 @@ class Vectorized : public Vectorizedi { } static Vectorized set(Vectorized a, Vectorized b, int16_t count = size()) { +======= + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + return _mm256_blendv_epi8(a.values, b.values, mask.values); + } + template + static Vectorized arange( + int16_t base = 0, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step, + base + 8 * step, + base + 9 * step, + base + 10 * step, + base + 11 * step, + base + 12 * step, + base + 13 * step, + base + 14 * step, + base + 15 * step); + } + static Vectorized set( + Vectorized a, + Vectorized b, + int16_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -457,9 +720,16 @@ class Vectorized : public Vectorizedi { } static Vectorized loadu(const void* ptr, int16_t count) { __at_align__ int16_t tmp_values[size()]; +<<<<<<< HEAD // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. 
+======= + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } @@ -477,8 +747,13 @@ class Vectorized : public Vectorizedi { std::memcpy(ptr, tmp_values, count * sizeof(int16_t)); } } +<<<<<<< HEAD const int16_t& operator[](int idx) const = delete; int16_t& operator[](int idx) = delete; +======= + const int16_t& operator[](int idx) const = delete; + int16_t& operator[](int idx) = delete; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized abs() const { return _mm256_abs_epi16(values); } @@ -522,17 +797,28 @@ class Vectorized : public Vectorizedi { template class Vectorized8 : public Vectorizedi { static_assert( +<<<<<<< HEAD std::is_same_v || std::is_same_v, "Only int8_t/uint8_t are supported"); protected: static const Vectorized ones; public: +======= + std::is_same_v || std::is_same_v, + "Only int8_t/uint8_t are supported"); + + protected: + static const Vectorized ones; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = T; static constexpr int size() { return 32; } using Vectorizedi::Vectorizedi; Vectorized8() {} +<<<<<<< HEAD Vectorized8(T v) { values = _mm256_set1_epi8(v); } Vectorized8(T val1, T val2, T val3, T val4, T val5, T val6, T val7, T val8, @@ -546,6 +832,77 @@ class Vectorized8 : public Vectorizedi { val9, val10, val11, val12, val13, val14, val15, val16, val17, val18, val19, val20, val21, val22, val23, val24, val25, val26, val27, val28, val29, val30, val31, val32); +======= + Vectorized8(T v) { + values = _mm256_set1_epi8(v); + } + Vectorized8( + T val1, + T val2, + T val3, + T val4, + T val5, + T val6, + T val7, + T val8, + T val9, + T val10, + T val11, + T val12, + T val13, + T val14, + T val15, + T val16, + T val17, + T val18, + T val19, + T val20, + T val21, + T val22, + T val23, + T val24, + T val25, + T val26, + T val27, + T val28, + T val29, + T val30, + T val31, + T val32) { + values = _mm256_setr_epi8( + val1, + val2, + val3, + val4, + val5, + val6, + val7, + val8, + val9, + val10, + val11, + val12, + val13, + val14, + val15, + val16, + val17, + val18, + val19, + val20, + val21, + val22, + val23, + val24, + val25, + val26, + val27, + val28, + val29, + val30, + val31, + val32); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template static Vectorized blend(Vectorized a, Vectorized b) { @@ -617,6 +974,7 @@ class Vectorized8 : public Vectorizedi { tmp_values[31] = _mm256_extract_epi8(b.values, 31); return loadu(tmp_values); } +<<<<<<< HEAD static Vectorized blendv(const Vectorized& a, const Vectorized& b, const Vectorized& mask) { return _mm256_blendv_epi8(a.values, b.values, mask.values); @@ -635,6 +993,53 @@ class Vectorized8 : public Vectorizedi { } static Vectorized set(Vectorized a, Vectorized b, T count = size()) { +======= + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + return _mm256_blendv_epi8(a.values, 
b.values, mask.values); + } + template + static Vectorized arange( + T base = 0, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step, + base + 8 * step, + base + 9 * step, + base + 10 * step, + base + 11 * step, + base + 12 * step, + base + 13 * step, + base + 14 * step, + base + 15 * step, + base + 16 * step, + base + 17 * step, + base + 18 * step, + base + 19 * step, + base + 20 * step, + base + 21 * step, + base + 22 * step, + base + 23 * step, + base + 24 * step, + base + 25 * step, + base + 26 * step, + base + 27 * step, + base + 28 * step, + base + 29 * step, + base + 30 * step, + base + 31 * step); + } + static Vectorized set(Vectorized a, Vectorized b, T count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -707,6 +1112,7 @@ class Vectorized8 : public Vectorizedi { return _mm256_loadu_si256(reinterpret_cast(ptr)); } static Vectorized loadu_one_fourth(const void* ptr) { +<<<<<<< HEAD // Fast path if only load element number of 8. // Note: We didn't merge it as fast path of loadu(const void* ptr, T count), // Because loadu(const void* ptr, T count) requires zero initialization for upper 128 bits. @@ -721,6 +1127,24 @@ class Vectorized8 : public Vectorizedi { // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. +======= + // Fast path if only load element number of 8. + // Note: We didn't merge it as fast path of loadu(const void* ptr, T count), + // Because loadu(const void* ptr, T count) requires zero initialization for + // upper 128 bits. However, by using _mm256_castsi128_si256, the upper 128 + // bits of the result are undefined. + // TODO We can use _mm256_zextsi128_si256 in the furture, + // since gcc 9.3 doesn't support it now. + __m128i input_128 = _mm_loadl_epi64(reinterpret_cast(ptr)); + return _mm256_castsi128_si256(input_128); + } + static Vectorized loadu(const void* ptr, T count) { + __at_align__ T tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
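The loadu(ptr, count) overloads in this hunk all follow the partial-load pattern the comment above describes: zero a temporary buffer with a loop, memcpy only the valid elements, then issue a full 256-bit load. A minimal standalone sketch of that pattern, assuming AVX2; the helper name is hypothetical and not part of the ATen API:

#include <immintrin.h>
#include <cstdint>
#include <cstring>

// Hypothetical helper illustrating the partial-load pattern above; not ATen code.
inline __m256i loadu_partial_epi8(const void* ptr, int count) {
  alignas(32) int8_t tmp[32];
  // Zero with a loop rather than "= {0}" so the compiler can emit a single
  // vector store, as the comment in the surrounding code explains.
  for (int i = 0; i < 32; ++i) {
    tmp[i] = 0;
  }
  std::memcpy(tmp, ptr, count);  // copy only the `count` valid bytes
  return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(tmp));
}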
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } @@ -735,7 +1159,12 @@ class Vectorized8 : public Vectorizedi { } else if (count > 0) { if (count == 8) { // Fast path if only store element number of 8 +<<<<<<< HEAD _mm_storel_epi64(reinterpret_cast<__m128i*>(ptr), _mm256_castsi256_si128(values)); +======= + _mm_storel_epi64( + reinterpret_cast<__m128i*>(ptr), _mm256_castsi256_si128(values)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { __at_align__ T tmp_values[size()]; _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); @@ -743,8 +1172,13 @@ class Vectorized8 : public Vectorizedi { } } } +<<<<<<< HEAD const T& operator[](int idx) const = delete; T& operator[](int idx) = delete; +======= + const T& operator[](int idx) const = delete; + T& operator[](int idx) = delete; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized real() const { return *this; } @@ -756,15 +1190,28 @@ class Vectorized8 : public Vectorizedi { } }; +<<<<<<< HEAD template<> class Vectorized: public Vectorized8 { public: +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorized8 { + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using Vectorized8::Vectorized8; Vectorized neg() const; Vectorized abs() const { +<<<<<<< HEAD return _mm256_abs_epi8(values); +======= + return _mm256_abs_epi8(values); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized operator==(const Vectorized& other) const { @@ -794,9 +1241,18 @@ class Vectorized: public Vectorized8 { Vectorized le(const Vectorized& other) const; }; +<<<<<<< HEAD template<> class Vectorized: public Vectorized8 { public: +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorized8 { + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using Vectorized8::Vectorized8; Vectorized neg() const; @@ -835,52 +1291,112 @@ class Vectorized: public Vectorized8 { }; template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_add_epi64(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_add_epi32(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return 
_mm256_add_epi16(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_add_epi8(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_add_epi8(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_sub_epi64(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_sub_epi32(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_sub_epi16(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_sub_epi8(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_sub_epi8(a, b); } @@ -909,7 +1425,14 @@ inline Vectorized Vectorized::neg() const { // by extracting each element, performing the operation pointwise, // then combining the results into a vector. 
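The comment above describes the fallback used when AVX2 has no 64-bit lane-wise instruction: pull each 64-bit element out, apply the operation in scalar code, and rebuild the vector. A minimal standalone sketch of that idea follows; the helper name and the store/load round trip are assumptions for illustration, whereas the ATen code visible in this hunk extracts lanes with _mm256_extract_epi64.

#include <immintrin.h>
#include <cstdint>

// Illustrative only: scalar emulation of a missing 64-bit lane-wise operation.
template <typename Op>
inline __m256i emulate_epi64(__m256i a, __m256i b, Op op) {
  alignas(32) int64_t va[4], vb[4], vc[4];
  _mm256_store_si256(reinterpret_cast<__m256i*>(va), a);
  _mm256_store_si256(reinterpret_cast<__m256i*>(vb), b);
  for (int i = 0; i < 4; ++i) {
    vc[i] = op(va[i], vb[i]);  // apply the scalar operation per lane
  }
  return _mm256_load_si256(reinterpret_cast<const __m256i*>(vc));
}

// e.g. a 64-bit multiply, which AVX2 has no single instruction for:
//   __m256i prod = emulate_epi64(x, y, [](int64_t p, int64_t q) { return p * q; });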
template +<<<<<<< HEAD Vectorized inline emulate(const Vectorized& a, const Vectorized& b, const op_t& op) { +======= +Vectorized inline emulate( + const Vectorized& a, + const Vectorized& b, + const op_t& op) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t a0 = _mm256_extract_epi64(a, 0); int64_t a1 = _mm256_extract_epi64(a, 1); int64_t a2 = _mm256_extract_epi64(a, 2); @@ -929,7 +1452,15 @@ Vectorized inline emulate(const Vectorized& a, const Vectorize } template +<<<<<<< HEAD Vectorized inline emulate(const Vectorized& a, const Vectorized& b, const Vectorized& c, const op_t& op) { +======= +Vectorized inline emulate( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c, + const op_t& op) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t a0 = _mm256_extract_epi64(a, 0); int64_t a1 = _mm256_extract_epi64(a, 1); int64_t a2 = _mm256_extract_epi64(a, 2); @@ -959,22 +1490,51 @@ Vectorized inline emulate(const Vectorized& a, const Vectorize // code for add as well. // Note: intentionally ignores undefined behavior like (-lowest * -1). template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { return emulate(a, b, [](int64_t a_point, int64_t b_point) __ubsan_ignore_undefined__ {return a_point * b_point;}); } template <> Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return emulate( + a, b, [](int64_t a_point, int64_t b_point) __ubsan_ignore_undefined__ { + return a_point * b_point; + }); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_mullo_epi32(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_mullo_epi16(a, b); } template +<<<<<<< HEAD Vectorized inline int_elementwise_binary_256(const Vectorized& a, const Vectorized& b, Op op) { +======= +Vectorized inline int_elementwise_binary_256( + const Vectorized& a, + const Vectorized& b, + Op op) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) T values_a[Vectorized::size()]; T values_b[Vectorized::size()]; a.store(values_a); @@ -986,7 +1546,13 @@ Vectorized inline int_elementwise_binary_256(const Vectorized& a, const Ve } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // We don't have an instruction for multiplying int8_t #ifndef CPU_CAPABILITY_AVX2 return int_elementwise_binary_256(a, b, std::multiplies()); @@ -1004,14 +1570,25 @@ Vectorized inline operator*(const Vectorized& a, const Vectorize } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= 
+Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // We don't have an instruction for multiplying uint8_t #ifndef CPU_CAPABILITY_AVX2 return int_elementwise_binary_256(a, b, std::multiplies()); #else __m256i mask00FF = _mm256_set1_epi16(0x00FF); +<<<<<<< HEAD __m256i a_lo = _mm256_and_si256 (a, mask00FF); __m256i b_lo = _mm256_and_si256 (b, mask00FF); +======= + __m256i a_lo = _mm256_and_si256(a, mask00FF); + __m256i b_lo = _mm256_and_si256(b, mask00FF); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256i a_hi = _mm256_srli_epi16(a, 8); __m256i b_hi = _mm256_srli_epi16(b, 8); __m256i res_lo = _mm256_and_si256(_mm256_mullo_epi16(a_lo, b_lo), mask00FF); @@ -1022,9 +1599,19 @@ Vectorized inline operator*(const Vectorized& a, const Vectori } template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { #ifndef CPU_CAPABILITY_AVX2 return emulate(a, b, [](int64_t a_point, int64_t b_point) {return std::min(a_point, b_point);}); +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +#ifndef CPU_CAPABILITY_AVX2 + return emulate(a, b, [](int64_t a_point, int64_t b_point) { + return std::min(a_point, b_point); + }); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #else __m256i cmp = _mm256_cmpgt_epi64(a, b); return _mm256_blendv_epi8(a, b, cmp); @@ -1032,29 +1619,63 @@ Vectorized inline minimum(const Vectorized& a, const Vectorize } template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_epi32(a, b); } template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_epi16(a, b); } template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_epi8(a, b); } template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_epu8(a, b); } template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { #ifndef CPU_CAPABILITY_AVX2 return emulate(a, b, [](int64_t a_point, int64_t b_point) {return std::max(a_point, b_point);}); +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +#ifndef CPU_CAPABILITY_AVX2 + return emulate(a, b, [](int64_t a_point, int64_t b_point) { + return std::max(a_point, b_point); + }); +>>>>>>> 5729657180 
([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #else __m256i cmp = _mm256_cmpgt_epi64(a, b); return _mm256_blendv_epi8(b, a, cmp); @@ -1062,108 +1683,238 @@ Vectorized inline maximum(const Vectorized& a, const Vectorize } template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_max_epi32(a, b); } template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_max_epi16(a, b); } template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_max_epi8(a, b); } template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_max_epu8(a, b); } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) { #ifndef CPU_CAPABILITY_AVX2 return emulate(a, min_val, max_val, [](int64_t a_point, int64_t min_point, int64_t max_point) {return std::min(max_point, std::max(a_point, min_point));}); +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_val, + const Vectorized& max_val) { +#ifndef CPU_CAPABILITY_AVX2 + return emulate( + a, + min_val, + max_val, + [](int64_t a_point, int64_t min_point, int64_t max_point) { + return std::min(max_point, std::max(a_point, min_point)); + }); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #else return minimum(maximum(a, min_val), max_val); #endif } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_val, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_epi32(max_val, _mm256_max_epi32(a, min_val)); } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_val, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_epi16(max_val, _mm256_max_epi16(a, min_val)); } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_val, + 
const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_epi8(max_val, _mm256_max_epi8(a, min_val)); } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_val, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_epu8(max_val, _mm256_max_epu8(a, min_val)); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) { #ifndef CPU_CAPABILITY_AVX2 return emulate(a, max_val, [](int64_t a_point, int64_t max_point) {return std::min(max_point, a_point);}); +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { +#ifndef CPU_CAPABILITY_AVX2 + return emulate(a, max_val, [](int64_t a_point, int64_t max_point) { + return std::min(max_point, a_point); + }); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #else return minimum(max_val, a); #endif } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_epi32(max_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_epi16(max_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_epi8(max_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_epu8(max_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) { #ifndef CPU_CAPABILITY_AVX2 return emulate(a, min_val, [](int64_t a_point, int64_t min_point) {return std::max(min_point, a_point);}); +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_val) { +#ifndef CPU_CAPABILITY_AVX2 + return emulate(a, min_val, [](int64_t a_point, int64_t min_point) { + return std::max(min_point, a_point); + }); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #else return maximum(min_val, a); #endif } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) { +======= +Vectorized inline clamp_min( + const 
Vectorized& a, + const Vectorized& min_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_max_epi32(min_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_max_epi16(min_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_max_epi8(min_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) { return _mm256_max_epu8(min_val, a); } @@ -1179,17 +1930,52 @@ std::enable_if_t, Vectorized> inline convert_to_int32(const int8_t* ptr, int count=Vectorized::size()) { if (count == Vectorized::size()) { return _mm256_cvtepi8_epi32(_mm_loadl_epi64(reinterpret_cast(ptr))); +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_val) { + return _mm256_max_epu8(min_val, a); +} + +template +std::enable_if_t< + !(std::is_same_v || std::is_same_v), + Vectorized< + int32_t>> inline convert_to_int32(const T* ptr, int count = Vectorized::size()) { + return Vectorized::loadu(ptr, count); +} + +template +std:: + enable_if_t, Vectorized> inline convert_to_int32( + const int8_t* ptr, + int count = Vectorized::size()) { + if (count == Vectorized::size()) { + return _mm256_cvtepi8_epi32( + _mm_loadl_epi64(reinterpret_cast(ptr))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { auto a = Vectorized::loadu(ptr, count); return _mm256_cvtepi8_epi32(_mm256_castsi256_si128(a)); } } +<<<<<<< HEAD template std::enable_if_t, Vectorized> inline convert_to_int32(const uint8_t* ptr, int count=Vectorized::size()) { if (count == Vectorized::size()) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast(ptr))); +======= +template +std:: + enable_if_t, Vectorized> inline convert_to_int32( + const uint8_t* ptr, + int count = Vectorized::size()) { + if (count == Vectorized::size()) { + return _mm256_cvtepu8_epi32( + _mm_loadl_epi64(reinterpret_cast(ptr))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { auto a = Vectorized::loadu(ptr, count); return _mm256_cvtepu8_epi32(_mm256_castsi256_si128(a)); @@ -1197,6 +1983,7 @@ inline convert_to_int32(const uint8_t* ptr, int count=Vectorized::size( } template <> +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { return int_elementwise_binary_256(a, b, std::divides()); } @@ -1230,10 +2017,72 @@ inline Vectorized operator^(const Vectorized& a, const Vectorized& b) { return _mm256_xor_si256(a, b); } template>::value, int> = 0> +======= +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return int_elementwise_binary_256(a, b, std::divides()); +} +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return int_elementwise_binary_256(a, b, std::divides()); 
+} +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return int_elementwise_binary_256(a, b, std::divides()); +} +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return int_elementwise_binary_256(a, b, std::divides()); +} +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return int_elementwise_binary_256(a, b, std::divides()); +} + +template < + class T, + typename std::enable_if_t< + std::is_base_of>::value, + int> = 0> +inline Vectorized operator&(const Vectorized& a, const Vectorized& b) { + return _mm256_and_si256(a, b); +} +template < + class T, + typename std::enable_if_t< + std::is_base_of>::value, + int> = 0> +inline Vectorized operator|(const Vectorized& a, const Vectorized& b) { + return _mm256_or_si256(a, b); +} +template < + class T, + typename std::enable_if_t< + std::is_base_of>::value, + int> = 0> +inline Vectorized operator^(const Vectorized& a, const Vectorized& b) { + return _mm256_xor_si256(a, b); +} +template < + class T, + typename std::enable_if_t< + std::is_base_of>::value, + int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized operator~(const Vectorized& a) { return _mm256_xor_si256(a, _mm256_set1_epi32(-1)); } +<<<<<<< HEAD inline Vectorized Vectorized::eq(const Vectorized& other) const { return (*this == other) & Vectorized(1); } @@ -1351,11 +2200,166 @@ inline Vectorized Vectorized::lt(const Vectorized& ot } inline Vectorized Vectorized::le(const Vectorized& other) const { +======= +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline 
Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*this <= other) & Vectorized(1); } template +<<<<<<< HEAD Vectorized inline shift_256_16(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline shift_256_16( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // No vector instruction for shifting int16_t, so emulating it instead. // Control masks for shuffle operation, treating 256 bits as an @@ -1364,6 +2368,7 @@ Vectorized inline shift_256_16(const Vectorized& a, const Vect // M!=N) is set so that shuffle will move element with index M from // input pair into element with index N in output pair, and element // with index M in output pair will be set to all 0s. 
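The eq/ne/gt/ge/lt/le helpers above all follow the same pattern: a vector comparison yields lanes that are either all ones or all zeros, and masking with Vectorized(1) collapses that into a 0/1 boolean lane. A minimal scalar sketch of the idea (assumption: plain C++ stand-in, not the ATen Vectorized API):

```cpp
#include <array>
#include <cstddef>
#include <cstdint>

// Model of `(*this == other) & Vectorized(1)`: the compare produces an all-ones
// mask per equal lane, and AND-ing with 1 turns it into a 0/1 value.
std::array<int32_t, 8> eq_as_bool(const std::array<int32_t, 8>& a,
                                  const std::array<int32_t, 8>& b) {
  std::array<int32_t, 8> out{};
  for (std::size_t i = 0; i < out.size(); ++i) {
    const int32_t mask = (a[i] == b[i]) ? -1 : 0;  // -1 == all bits set
    out[i] = mask & 1;
  }
  return out;
}
```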
+<<<<<<< HEAD __m256i ctl_0_1 = _mm256_set_epi8(29, 28, 0x80, 0x80, 25, 24, 0x80, 0x80, 21, 20, 0x80, 0x80, 17, 16, 0x80, 0x80, 13, 12, 0x80, 0x80, 9, 8, 0x80, 0x80, @@ -1372,6 +2377,74 @@ Vectorized inline shift_256_16(const Vectorized& a, const Vect 0x80, 0x80, 23, 22, 0x80, 0x80, 19, 18, 0x80, 0x80, 15, 14, 0x80, 0x80, 11, 10, 0x80, 0x80, 7, 6, 0x80, 0x80, 3, 2); +======= + __m256i ctl_0_1 = _mm256_set_epi8( + 29, + 28, + 0x80, + 0x80, + 25, + 24, + 0x80, + 0x80, + 21, + 20, + 0x80, + 0x80, + 17, + 16, + 0x80, + 0x80, + 13, + 12, + 0x80, + 0x80, + 9, + 8, + 0x80, + 0x80, + 5, + 4, + 0x80, + 0x80, + 1, + 0, + 0x80, + 0x80); + __m256i ctl_1_0 = _mm256_set_epi8( + 0x80, + 0x80, + 31, + 30, + 0x80, + 0x80, + 27, + 26, + 0x80, + 0x80, + 23, + 22, + 0x80, + 0x80, + 19, + 18, + 0x80, + 0x80, + 15, + 14, + 0x80, + 0x80, + 11, + 10, + 0x80, + 0x80, + 7, + 6, + 0x80, + 0x80, + 3, + 2); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Masks for bitwise and operation, treating 256 bits as an array of // 16-bit elements, and considering them in pairs of neighboring @@ -1423,8 +2496,20 @@ Vectorized inline shift_256_16(const Vectorized& a, const Vect return c; } +<<<<<<< HEAD template || std::is_same_v, int> = 0> Vectorized inline shift_256_8(const Vectorized& a, const Vectorized& b) { +======= +template < + bool left_shift, + typename T, + typename std::enable_if_t< + std::is_same_v || std::is_same_v, + int> = 0> +Vectorized inline shift_256_8( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // No vector instruction for shifting int8_t/uint8_t, so emulating // it instead. @@ -1435,6 +2520,7 @@ Vectorized inline shift_256_8(const Vectorized& a, const Vectorized& b) // with index M from input quadruple into element with index N in // output quadruple, and other elements in output quadruple will be // set to all 0s. 
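shift_256_16 works around the lack of a per-lane 16-bit variable shift in AVX2 by parking each 16-bit element in a 32-bit slot (via the shuffle control masks above), doing a 32-bit variable shift, and moving the result back so spilled bits cannot leak into a neighbouring lane. A minimal scalar sketch of the widen-shift-narrow idea for a single left-shifted lane (assumption: conceptual model, not the actual shuffle-based kernel):

```cpp
#include <cstdint>

// Put the 16-bit lane in the upper half of a 32-bit word so that bits shifted out
// of the lane fall off the top of the 32-bit word instead of into a neighbour.
uint16_t shl16_via_32(uint16_t a, unsigned shift) {
  uint32_t widened = static_cast<uint32_t>(a) << 16;  // lane occupies bits 16..31
  widened <<= (shift & 15);                           // 32-bit shift, overflow bits vanish
  return static_cast<uint16_t>(widened >> 16);        // bring the lane back down
}
```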
+<<<<<<< HEAD __m256i ctl_0_3 = _mm256_set_epi8(28, 0x80, 0x80, 0x80, 24, 0x80, 0x80, 0x80, 20, 0x80, 0x80, 0x80, 16, 0x80, 0x80, 0x80, 12, 0x80, 0x80, 0x80, 8, 0x80, 0x80, 0x80, @@ -1467,6 +2553,272 @@ Vectorized inline shift_256_8(const Vectorized& a, const Vectorized& b) 0x80, 23, 0x80, 0x80, 0x80, 19, 0x80, 0x80, 0x80, 15, 0x80, 0x80, 0x80, 11, 0x80, 0x80, 0x80, 7, 0x80, 0x80, 0x80, 3, 0x80, 0x80); +======= + __m256i ctl_0_3 = _mm256_set_epi8( + 28, + 0x80, + 0x80, + 0x80, + 24, + 0x80, + 0x80, + 0x80, + 20, + 0x80, + 0x80, + 0x80, + 16, + 0x80, + 0x80, + 0x80, + 12, + 0x80, + 0x80, + 0x80, + 8, + 0x80, + 0x80, + 0x80, + 4, + 0x80, + 0x80, + 0x80, + 0, + 0x80, + 0x80, + 0x80); + __m256i ctl_1_0 = _mm256_set_epi8( + 0x80, + 0x80, + 0x80, + 29, + 0x80, + 0x80, + 0x80, + 25, + 0x80, + 0x80, + 0x80, + 21, + 0x80, + 0x80, + 0x80, + 17, + 0x80, + 0x80, + 0x80, + 13, + 0x80, + 0x80, + 0x80, + 9, + 0x80, + 0x80, + 0x80, + 5, + 0x80, + 0x80, + 0x80, + 1); + __m256i ctl_1_3 = _mm256_set_epi8( + 29, + 0x80, + 0x80, + 0x80, + 25, + 0x80, + 0x80, + 0x80, + 21, + 0x80, + 0x80, + 0x80, + 17, + 0x80, + 0x80, + 0x80, + 13, + 0x80, + 0x80, + 0x80, + 9, + 0x80, + 0x80, + 0x80, + 5, + 0x80, + 0x80, + 0x80, + 1, + 0x80, + 0x80, + 0x80); + __m256i ctl_2_0 = _mm256_set_epi8( + 0x80, + 0x80, + 0x80, + 30, + 0x80, + 0x80, + 0x80, + 26, + 0x80, + 0x80, + 0x80, + 22, + 0x80, + 0x80, + 0x80, + 18, + 0x80, + 0x80, + 0x80, + 14, + 0x80, + 0x80, + 0x80, + 10, + 0x80, + 0x80, + 0x80, + 6, + 0x80, + 0x80, + 0x80, + 2); + __m256i ctl_2_3 = _mm256_set_epi8( + 30, + 0x80, + 0x80, + 0x80, + 26, + 0x80, + 0x80, + 0x80, + 22, + 0x80, + 0x80, + 0x80, + 18, + 0x80, + 0x80, + 0x80, + 14, + 0x80, + 0x80, + 0x80, + 10, + 0x80, + 0x80, + 0x80, + 6, + 0x80, + 0x80, + 0x80, + 2, + 0x80, + 0x80, + 0x80); + __m256i ctl_3_0 = _mm256_set_epi8( + 0x80, + 0x80, + 0x80, + 31, + 0x80, + 0x80, + 0x80, + 27, + 0x80, + 0x80, + 0x80, + 23, + 0x80, + 0x80, + 0x80, + 19, + 0x80, + 0x80, + 0x80, + 15, + 0x80, + 0x80, + 0x80, + 11, + 0x80, + 0x80, + 0x80, + 7, + 0x80, + 0x80, + 0x80, + 3); + __m256i ctl_3_1 = _mm256_set_epi8( + 0x80, + 0x80, + 31, + 0x80, + 0x80, + 0x80, + 27, + 0x80, + 0x80, + 0x80, + 23, + 0x80, + 0x80, + 0x80, + 19, + 0x80, + 0x80, + 0x80, + 15, + 0x80, + 0x80, + 0x80, + 11, + 0x80, + 0x80, + 0x80, + 7, + 0x80, + 0x80, + 0x80, + 3, + 0x80); + __m256i ctl_3_2 = _mm256_set_epi8( + 0x80, + 31, + 0x80, + 0x80, + 0x80, + 27, + 0x80, + 0x80, + 0x80, + 23, + 0x80, + 0x80, + 0x80, + 19, + 0x80, + 0x80, + 0x80, + 15, + 0x80, + 0x80, + 0x80, + 11, + 0x80, + 0x80, + 0x80, + 7, + 0x80, + 0x80, + 0x80, + 3, + 0x80, + 0x80); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Masks for bitwise and operation, treating 256 bits as an array of // 8-bit elements, and considering them in quadruples of neighboring @@ -1497,11 +2849,18 @@ Vectorized inline shift_256_8(const Vectorized& a, const Vectorized& b) __m256i c0; if (left_shift) c0 = _mm256_sllv_epi32(a0, b0); +<<<<<<< HEAD else if constexpr (std::is_same_v) c0 = _mm256_srav_epi32(a0, b0); else c0 = _mm256_srlv_epi32(a0, b0); +======= + else if constexpr (std::is_same_v) + c0 = _mm256_srav_epi32(a0, b0); + else + c0 = _mm256_srlv_epi32(a0, b0); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c0 = _mm256_shuffle_epi8(c0, ctl_3_0); // Peform shifting the same way for input array elements with @@ -1511,11 +2870,18 @@ Vectorized inline 
shift_256_8(const Vectorized& a, const Vectorized& b) __m256i c1; if (left_shift) c1 = _mm256_sllv_epi32(a1, b1); +<<<<<<< HEAD else if constexpr (std::is_same_v) c1 = _mm256_srav_epi32(a1, b1); else c1 = _mm256_srlv_epi32(a1, b1); +======= + else if constexpr (std::is_same_v) + c1 = _mm256_srav_epi32(a1, b1); + else + c1 = _mm256_srlv_epi32(a1, b1); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c1 = _mm256_shuffle_epi8(c1, ctl_3_1); // Peform shifting the same way for input array elements with @@ -1525,25 +2891,43 @@ Vectorized inline shift_256_8(const Vectorized& a, const Vectorized& b) __m256i c2; if (left_shift) c2 = _mm256_sllv_epi32(a2, b2); +<<<<<<< HEAD else if constexpr (std::is_same_v) c2 = _mm256_srav_epi32(a2, b2); else c2 = _mm256_srlv_epi32(a2, b2); +======= + else if constexpr (std::is_same_v) + c2 = _mm256_srav_epi32(a2, b2); + else + c2 = _mm256_srlv_epi32(a2, b2); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c2 = _mm256_shuffle_epi8(c2, ctl_3_2); // Peform shifting the same way for input array elements with // idx%4==3. +<<<<<<< HEAD __m256i a3 = _mm256_and_si256(a, keep_3); +======= + __m256i a3 = _mm256_and_si256(a, keep_3); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256i b3 = _mm256_shuffle_epi8(b, ctl_3_0); __m256i c3; if (left_shift) c3 = _mm256_sllv_epi32(a3, b3); +<<<<<<< HEAD else if constexpr (std::is_same_v) c3 = _mm256_srav_epi32(a3, b3); else c3 = _mm256_srlv_epi32(a3, b3); +======= + else if constexpr (std::is_same_v) + c3 = _mm256_srav_epi32(a3, b3); + else + c3 = _mm256_srlv_epi32(a3, b3); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c3 = _mm256_and_si256(c3, keep_3); // Merge partial results into the final result. 
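The repeated `if constexpr (std::is_same_v...)` branches above pick `_mm256_srav_epi32` (arithmetic, sign-filling) for the signed int8_t case and `_mm256_srlv_epi32` (logical, zero-filling) otherwise. A minimal scalar sketch of the same compile-time dispatch (assumption: standalone illustration, not the kernel itself):

```cpp
#include <cstdint>
#include <type_traits>

// Signed element type -> arithmetic right shift (sign bit replicated);
// unsigned element type -> logical right shift (zero fill).
template <typename T,
          typename std::enable_if_t<
              std::is_same_v<T, int8_t> || std::is_same_v<T, uint8_t>, int> = 0>
T shift_right(T a, unsigned s) {
  if constexpr (std::is_same_v<T, int8_t>) {
    return static_cast<T>(a >> s);                        // arithmetic
  } else {
    return static_cast<T>(static_cast<uint8_t>(a) >> s);  // logical
  }
}
```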
@@ -1555,31 +2939,62 @@ Vectorized inline shift_256_8(const Vectorized& a, const Vectorized& b) } template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_sllv_epi64(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_sllv_epi32(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return shift_256_16(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return shift_256_8(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return shift_256_8(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { // No vector instruction for right arithmetic shifting int64_t, so emulating it // instead. @@ -1589,6 +3004,20 @@ Vectorized inline operator>>(const Vectorized& a, const Vector __m256i zero = _mm256_set1_epi64x(0); __m256i max_shift = _mm256_set1_epi64x(64); __m256i mask = _mm256_or_si256(_mm256_cmpgt_epi64(zero, b), _mm256_cmpgt_epi64(b, max_shift)); +======= +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + // No vector instruction for right arithmetic shifting int64_t, so emulating + // it instead. + + // Clamp the shift values such that shift values < 0 and > 64 are changed to + // 64 which results in -1 for negative input and 0 for non-negative input. + __m256i zero = _mm256_set1_epi64x(0); + __m256i max_shift = _mm256_set1_epi64x(64); + __m256i mask = _mm256_or_si256( + _mm256_cmpgt_epi64(zero, b), _mm256_cmpgt_epi64(b, max_shift)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256i shift = _mm256_blendv_epi8(b, max_shift, mask); // Shift the number logically to the right, thus filling the most // significant bits with 0s. 
Then, replace these bits with the sign @@ -1603,25 +3032,54 @@ Vectorized inline operator>>(const Vectorized& a, const Vector } template <> +<<<<<<< HEAD Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_srav_epi32(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return shift_256_16(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return shift_256_8(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return shift_256_8(a, b); } #endif +<<<<<<< HEAD }} // namespace at::vec::CPU_CAPABILITY +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h index 9b900cd0f63e..99ceb6eba8d4 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h @@ -35,8 +35,13 @@ // specified by float_vec_return_type. // // When writing kernels with these vectors, it is expected that floating- +<<<<<<< HEAD // point operations will be carried out in a loop over Vectorized::float_num_vecs // iterations. +======= +// point operations will be carried out in a loop over +// Vectorized::float_num_vecs iterations. 
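For 64-bit lanes, the operator>> above has no arithmetic-shift instruction to lean on, so it clamps out-of-range shift counts to 64, shifts logically, and then re-inserts the sign bits; that is why negative inputs end up as -1 and non-negative inputs as 0 for oversized shifts. A minimal scalar sketch of that sequence for one lane (assumption: models the idea, not the AVX2 blend/mask code):

```cpp
#include <cstdint>

int64_t sra64_emulated(int64_t a, int64_t b) {
  // Clamp shift counts < 0 or > 64 to 64, matching the comment in the kernel above.
  const uint64_t shift = (b < 0 || b > 64) ? 64u : static_cast<uint64_t>(b);
  const uint64_t sign = (a < 0) ? ~uint64_t{0} : 0u;  // all ones for negative input
  const uint64_t logical =
      (shift == 64) ? 0u : (static_cast<uint64_t>(a) >> shift);
  const uint64_t sign_fill = (shift == 0) ? 0u : (sign << (64 - shift));
  return static_cast<int64_t>(logical | sign_fill);
}
```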
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::vec { inline namespace CPU_CAPABILITY { @@ -103,10 +108,19 @@ inline __m256i pack_saturate_and_clamp( } template +<<<<<<< HEAD typename std::enable_if_t || std::is_same_v, at::vec::Vectorized> inline convert_int8_to_float(at::vec::Vectorized src) { // Note: this function only convert inputs number of elements equal to at::vec::Vectorized.size() // Only handle first 8*8 bits +======= +typename std::enable_if_t< + std::is_same_v || std::is_same_v, + at::vec::Vectorized< + float>> inline convert_int8_to_float(at::vec::Vectorized src) { + // Note: this function only convert inputs number of elements equal to + // at::vec::Vectorized.size() Only handle first 8*8 bits +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m128i input_128 = _mm256_castsi256_si128(src); // Convert from 8*uint8/int8 to 8*int32 __m256i input_256_int32; @@ -119,8 +133,15 @@ inline convert_int8_to_float(at::vec::Vectorized src) { } template +<<<<<<< HEAD typename std::enable_if_t || std::is_same_v, at::vec::Vectorized> inline convert_float_to_int8(at::vec::Vectorized src) { +======= +typename std::enable_if_t< + std::is_same_v || std::is_same_v, + at::vec::Vectorized< + T>> inline convert_float_to_int8(at::vec::Vectorized src) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Convert from float32 to int32 with truncation __m256i x_values_int32 = _mm256_cvttps_epi32(src); @@ -131,10 +152,17 @@ inline convert_float_to_int8(at::vec::Vectorized src) { constexpr auto max_val = std::numeric_limits::max(); // Convert from int16 to uint8/int8 using unsigned saturation +<<<<<<< HEAD __m256i xyzw_clamped_v = pack_saturate_and_clamp( xy_packed_v, xy_packed_v, min_val, max_val); __m256i permute_mask_v = _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); +======= + __m256i xyzw_clamped_v = + pack_saturate_and_clamp(xy_packed_v, xy_packed_v, min_val, max_val); + __m256i permute_mask_v = + _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_permutevar8x32_epi32(xyzw_clamped_v, permute_mask_v); } @@ -255,6 +283,7 @@ __FORCE_INLINE void QuantizeAvx2( } } +<<<<<<< HEAD template<> struct Vectorized : public Vectorizedqi { using size_type = int; @@ -384,6 +413,151 @@ struct Vectorized : public Vectorizedqi { template <> Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public Vectorizedqi { + using size_type = int; + static constexpr size_type kSize = Vectorized::size(); + static constexpr size_type size() { + return kSize; + } + + static constexpr int kFloatNumVecs = kSize / Vectorized::size(); + static constexpr int float_num_vecs() { + return kFloatNumVecs; + } + + static constexpr int int_num_vecs() { + return 1; + } + + using float_vec_return_type = std::array, kFloatNumVecs>; + using int_vec_return_type = std::array, 1>; + using value_type = c10::qint32::underlying; + + public: + using Vectorizedqi::Vectorizedqi; + Vectorized() {} + + Vectorized(__m256i vals_) { + vals = vals_; + } + + // Broadcast constructor + Vectorized(const c10::qint32& 
val) { + value_type uw = val.val_; + vals = _mm256_set1_epi32(uw); + } + + void store(void* ptr, int count = size()) const { + if (count != size()) { + memcpy(ptr, &vals, count * sizeof(value_type)); + } else { + _mm256_storeu_si256((__m256i*)ptr, vals); + } + } + + static Vectorized loadu(const void* ptr) { + return Vectorized(ptr); + } + + static Vectorized loadu(const void* ptr, int64_t count) { + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. + for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return _mm256_loadu_si256((const __m256i*)tmp_values); + } + + float_vec_return_type dequantize( + Vectorized scale, + Vectorized /*zero_point*/, + Vectorized scale_zp_premul) const { + __m256 float_vals = _mm256_cvtepi32_ps(vals); + return {vec::fmadd(scale, Vectorized(float_vals), scale_zp_premul)}; + } + + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point) const { + __m256 float_vals = _mm256_cvtepi32_ps(vals); + return {(Vectorized(float_vals) - zero_point) * scale}; + } + + static Vectorized quantize( + const float_vec_return_type& rhs, + float scale, + int32_t zero_point, + float /*inverse_scale*/) { + Vectorized retval; + auto rhs_data = (__m256)rhs[0]; + at::native::quantize_vec( + scale, + zero_point, + (float*)&rhs_data, + (c10::qint32*)&retval.vals, + size()); + return retval; + } + + Vectorized maximum(Vectorized b) const { + return _mm256_max_epi32(vals, b.vals); + } + + Vectorized minimum(Vectorized b) const { + return _mm256_min_epi32(vals, b.vals); + } + + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + + Vectorized relu6( + Vectorized zero_point, + Vectorized q_six) { + return _mm256_min_epi32( + _mm256_max_epi32(vals, zero_point.vals), q_six.vals); + } + + int_vec_return_type widening_subtract(Vectorized b) const { + return {_mm256_sub_epi32(vals, b)}; + } + + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + __m256 multiplier_v = _mm256_set1_ps(multiplier); + __m256i zero_point_v = _mm256_set1_epi32(zero_point); + + __m256 scaled = _mm256_mul_ps(_mm256_cvtepi32_ps(inp[0]), multiplier_v); + __m256i rounded = _mm256_cvtps_epi32(scaled); + return _mm256_add_epi32(rounded, zero_point_v); + } + + private: + // Load from memory constructor + Vectorized(const void* ptr) { + vals = _mm256_loadu_si256((const __m256i*)ptr); + } +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.maximum(b); } @@ -447,6 +621,7 @@ __m256i RequantizeAvx2( return xyzw_clamped_v; } +<<<<<<< HEAD template<> struct Vectorized : public Vectorizedqi { static constexpr int kSize = VECTOR_WIDTH; @@ -519,6 +694,88 @@ struct Vectorized : public Vectorizedqi { __m256i cvtepi8_epi32(__m128i epi8_vals) const { return _mm256_cvtepi8_epi32(epi8_vals); } +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public Vectorizedqi { + static constexpr int kSize = 
VECTOR_WIDTH; + static constexpr int size() { + return kSize; + } + + static constexpr int kFloatNumVecs = kSize / Vectorized::size(); + static constexpr int float_num_vecs() { + return kFloatNumVecs; + } + + static constexpr int kIntNumVecs = kSize / Vectorized::size(); + static constexpr int int_num_vecs() { + return kIntNumVecs; + } + + using float_vec_return_type = std::array, kFloatNumVecs>; + using int_vec_return_type = std::array, kIntNumVecs>; + using value_type = typename c10::qint8::underlying; + + public: + using Vectorizedqi::Vectorizedqi; + + Vectorized() {} + Vectorized(__m256i vals_) { + vals = vals_; + } + + // Broadcast constructor + Vectorized(const c10::qint8& val) { + value_type uw = val.val_; + vals = _mm256_set1_epi8(uw); + } + + // This is needed because the compiler emits awful code for the default + // constructor for moving the enum + // NOLINTNEXTLINE(clang-diagnostic-deprecated-copy) + C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wdeprecated-copy") + C10_CLANG_DIAGNOSTIC_IGNORE("-Wdeprecated-copy") +#endif + Vectorized(const Vectorized& other) : Vectorizedqi(other.vals) {} + C10_CLANG_DIAGNOSTIC_POP() + + void store(void* ptr, int count = size()) const { + if (count != size()) { + memcpy(ptr, &vals, count * sizeof(value_type)); + } else { + _mm256_storeu_si256((__m256i*)ptr, vals); + } + } + + static Vectorized loadu(const void* ptr) { + return Vectorized(ptr); + } + + static Vectorized loadu(const void* ptr, int64_t count) { + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
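The loadu(ptr, count) overloads above all zero a full-width temporary with a loop, memcpy only `count` valid elements into it, and then load the whole buffer, so lanes past `count` are well defined (see the issue linked in the comment). A minimal standalone sketch of that pattern for 8-bit lanes (assumption: a plain AVX2 helper, not the Vectorized member itself):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <immintrin.h>

__m256i loadu_partial_epi8(const void* ptr, int count) {
  alignas(32) int8_t tmp[32];
  // Zero with a loop rather than "={0}", mirroring the rationale in the comment above.
  for (int i = 0; i < 32; ++i) {
    tmp[i] = 0;
  }
  std::memcpy(tmp, ptr, static_cast<std::size_t>(count) * sizeof(int8_t));
  return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(tmp));
}
```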
+ for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return _mm256_loadu_si256((const __m256i*)tmp_values); + } + + private: + __m256i cvtepi8_epi32(__m128i epi8_vals) const { + return _mm256_cvtepi8_epi32(epi8_vals); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) public: float_vec_return_type dequantize( @@ -579,6 +836,7 @@ struct Vectorized : public Vectorizedqi { } Vectorized maximum(Vectorized b) const { +<<<<<<< HEAD return _mm256_max_epi8(vals, b.vals); } @@ -720,6 +978,159 @@ struct Vectorized : public Vectorizedqi { __m256i cvtepu8_epi32(__m128i epu8_vals) const { return _mm256_cvtepu8_epi32(epu8_vals); } +======= + return _mm256_max_epi8(vals, b.vals); + } + + Vectorized minimum(Vectorized b) const { + return _mm256_min_epi8(vals, b.vals); + } + + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + + Vectorized relu6( + Vectorized zero_point, + Vectorized q_six) { + return _mm256_min_epi8(_mm256_max_epi8(vals, zero_point.vals), q_six.vals); + } + + int_vec_return_type widening_subtract(Vectorized b) const { + __m128i int_val0 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 0)); + __m128i int_val1 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 1)); + __m128i int_val2 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 2)); + __m128i int_val3 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 3)); + + __m256i int32_val0 = cvtepi8_epi32(int_val0); + __m256i int32_val1 = cvtepi8_epi32(int_val1); + __m256i int32_val2 = cvtepi8_epi32(int_val2); + __m256i int32_val3 = cvtepi8_epi32(int_val3); + + __m128i int_b0 = _mm_set1_epi64x(_mm256_extract_epi64(b, 0)); + __m128i int_b1 = _mm_set1_epi64x(_mm256_extract_epi64(b, 1)); + __m128i int_b2 = _mm_set1_epi64x(_mm256_extract_epi64(b, 2)); + __m128i int_b3 = _mm_set1_epi64x(_mm256_extract_epi64(b, 3)); + + __m256i int32_b0 = cvtepi8_epi32(int_b0); + __m256i int32_b1 = cvtepi8_epi32(int_b1); + __m256i int32_b2 = cvtepi8_epi32(int_b2); + __m256i int32_b3 = cvtepi8_epi32(int_b3); + + __m256i res_0 = _mm256_sub_epi32(int32_val0, int32_b0); + __m256i res_1 = _mm256_sub_epi32(int32_val1, int32_b1); + __m256i res_2 = _mm256_sub_epi32(int32_val2, int32_b2); + __m256i res_3 = _mm256_sub_epi32(int32_val3, int32_b3); + + return { + Vectorized(res_0), + Vectorized(res_1), + Vectorized(res_2), + Vectorized(res_3)}; + } + + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + __m256 multiplier_v = _mm256_set1_ps(multiplier); + __m256i zero_point_v = _mm256_set1_epi32(zero_point); + return RequantizeAvx2(inp, multiplier_v, zero_point_v); + } + + private: + // Load from memory constructor + Vectorized(const void* ptr) { + vals = _mm256_loadu_si256((const __m256i*)ptr); + } +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return a.maximum(b); +} + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public Vectorizedqi { + static constexpr int kSize = VECTOR_WIDTH; + static constexpr int size() { + return kSize; + } + + static constexpr int kFloatNumVecs = kSize / Vectorized::size(); + static constexpr int float_num_vecs() { + return kFloatNumVecs; + } + + static constexpr int kIntNumVecs = kSize / Vectorized::size(); + static constexpr int int_num_vecs() { + return kIntNumVecs; + } + + using 
float_vec_return_type = std::array, kFloatNumVecs>; + using int_vec_return_type = std::array, kIntNumVecs>; + using value_type = typename c10::quint8::underlying; + + public: + using Vectorizedqi::Vectorizedqi; + Vectorized() {} + + Vectorized(__m256i vals_) { + vals = vals_; + } + + // Broadcast constructor + Vectorized(const c10::quint8& val) { + value_type uw = val.val_; + vals = _mm256_set1_epi8(uw); + } + + // NOLINTNEXTLINE(clang-diagnostic-deprecated-copy) + C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wdeprecated-copy") + C10_CLANG_DIAGNOSTIC_IGNORE("-Wdeprecated-copy") +#endif + Vectorized(const Vectorized& other) : Vectorizedqi(other.vals) {} + C10_CLANG_DIAGNOSTIC_POP() + + void store(void* ptr, int count = size()) const { + if (count != size()) { + memcpy(ptr, &vals, count * sizeof(value_type)); + } else { + _mm256_storeu_si256((__m256i*)ptr, vals); + } + } + + static Vectorized loadu(const void* ptr) { + return Vectorized(ptr); + } + + static Vectorized loadu(const void* ptr, int64_t count) { + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. + for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return _mm256_loadu_si256((const __m256i*)tmp_values); + } + + private: + __m256i cvtepu8_epi32(__m128i epu8_vals) const { + return _mm256_cvtepu8_epi32(epu8_vals); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) public: float_vec_return_type dequantize( @@ -780,6 +1191,7 @@ struct Vectorized : public Vectorizedqi { } Vectorized maximum(Vectorized b) const { +<<<<<<< HEAD return _mm256_max_epu8(vals, b.vals); } @@ -848,6 +1260,77 @@ struct Vectorized : public Vectorizedqi { template <> Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= + return _mm256_max_epu8(vals, b.vals); + } + + Vectorized minimum(Vectorized b) const { + return _mm256_min_epu8(vals, b.vals); + } + + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + + Vectorized relu6( + Vectorized zero_point, + Vectorized q_six) { + return _mm256_min_epu8(_mm256_max_epu8(vals, zero_point.vals), q_six.vals); + } + + int_vec_return_type widening_subtract(Vectorized b) const { + __m128i int_val0 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 0)); + __m128i int_val1 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 1)); + __m128i int_val2 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 2)); + __m128i int_val3 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 3)); + + __m256i int32_val0 = cvtepu8_epi32(int_val0); + __m256i int32_val1 = cvtepu8_epi32(int_val1); + __m256i int32_val2 = cvtepu8_epi32(int_val2); + __m256i int32_val3 = cvtepu8_epi32(int_val3); + + __m128i int_b0 = _mm_set1_epi64x(_mm256_extract_epi64(b, 0)); + __m128i int_b1 = _mm_set1_epi64x(_mm256_extract_epi64(b, 1)); + __m128i int_b2 = _mm_set1_epi64x(_mm256_extract_epi64(b, 2)); + __m128i int_b3 = _mm_set1_epi64x(_mm256_extract_epi64(b, 3)); + + __m256i int32_b0 = cvtepu8_epi32(int_b0); + __m256i int32_b1 = cvtepu8_epi32(int_b1); + __m256i int32_b2 = cvtepu8_epi32(int_b2); + __m256i int32_b3 = cvtepu8_epi32(int_b3); + + __m256i res_0 = 
_mm256_sub_epi32(int32_val0, int32_b0); + __m256i res_1 = _mm256_sub_epi32(int32_val1, int32_b1); + __m256i res_2 = _mm256_sub_epi32(int32_val2, int32_b2); + __m256i res_3 = _mm256_sub_epi32(int32_val3, int32_b3); + return { + Vectorized(res_0), + Vectorized(res_1), + Vectorized(res_2), + Vectorized(res_3)}; + } + + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + __m256 multiplier_v = _mm256_set1_ps(multiplier); + __m256i zero_point_v = _mm256_set1_epi32(zero_point); + return RequantizeAvx2(inp, multiplier_v, zero_point_v); + } + + private: + // Load from memory constructor + Vectorized(const void* ptr) { + vals = _mm256_loadu_si256((const __m256i*)ptr); + } +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.maximum(b); } @@ -908,7 +1391,13 @@ struct VectorizedQuantizedConverter { float tmp_vals[Vectorized::size()]; for (const auto j : c10::irange(Vectorized::size())) { tmp_vals[j] = at::native::dequantize_val( +<<<<<<< HEAD scale[j], zero_point[j], T(vals[Vectorized::size() * i + j])); +======= + scale[j], + zero_point[j], + T(vals[Vectorized::size() * i + j])); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } rv[i] = Vectorized(tmp_vals); } @@ -928,10 +1417,17 @@ struct VectorizedQuantizedConverter { template <> struct Vectorized : public VectorizedQuantizedConverter< +<<<<<<< HEAD c10::qint32, std::array, 1>, std::array, 1>, Vectorized::size()> { +======= + c10::qint32, + std::array, 1>, + std::array, 1>, + Vectorized::size()> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using VectorizedQuantizedConverter::VectorizedQuantizedConverter; static Vectorized loadu(const void* ptr) { @@ -940,14 +1436,27 @@ struct Vectorized : public VectorizedQuantizedConverter< static Vectorized loadu(const void* ptr, int64_t count) { __at_align__ value_type tmp_values[size()]; +<<<<<<< HEAD // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. +======= + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
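requantize_from_int above maps the widened int32 differences back to the quantized domain: multiply by a float multiplier, round to nearest, and add the output zero point (the AVX2 path then packs and saturates via RequantizeAvx2). A minimal scalar sketch of the per-lane arithmetic (assumption: illustrative only; saturation to the 8-bit range is left out):

```cpp
#include <cmath>
#include <cstdint>

int32_t requantize_one(int32_t widened, float multiplier, int32_t zero_point) {
  const float scaled = static_cast<float>(widened) * multiplier;  // _mm256_mul_ps step
  // _mm256_cvtps_epi32 rounds to nearest (even) under the default MXCSR mode;
  // std::nearbyint likewise follows the current rounding mode.
  const int32_t rounded = static_cast<int32_t>(std::nearbyint(scaled));
  return rounded + zero_point;                                    // _mm256_add_epi32 step
}
```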
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } std::memcpy( +<<<<<<< HEAD tmp_values, reinterpret_cast(ptr), count * sizeof(value_type)); +======= + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(tmp_values); } @@ -989,11 +1498,18 @@ struct Vectorized : public VectorizedQuantizedConverter< return retval; } +<<<<<<< HEAD Vectorized relu(Vectorized zero_point) const { return maximum(zero_point); } +======= + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized relu6( Vectorized zero_point, Vectorized q_six) { @@ -1028,7 +1544,13 @@ struct Vectorized : public VectorizedQuantizedConverter< }; template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.maximum(b); } @@ -1055,11 +1577,22 @@ Vectorized inline operator+( } template <> +<<<<<<< HEAD struct Vectorized : public VectorizedQuantizedConverter< c10::qint8, std::array, 4>, std::array, 4>, 4 * Vectorized::size()> { +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public VectorizedQuantizedConverter< + c10::qint8, + std::array, 4>, + std::array, 4>, + 4 * Vectorized::size()> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using VectorizedQuantizedConverter::VectorizedQuantizedConverter; static Vectorized loadu(const void* ptr) { @@ -1068,14 +1601,27 @@ struct Vectorized : public VectorizedQuantizedConverter< static Vectorized loadu(const void* ptr, int64_t count) { __at_align__ value_type tmp_values[size()]; +<<<<<<< HEAD // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. +======= + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
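The fallback VectorizedQuantizedConverter path above dequantizes one element at a time with at::native::dequantize_val, i.e. the usual affine mapping real = (q - zero_point) * scale. A minimal scalar sketch of that mapping and its inverse (assumption: the rounding and clamping details of the real quantize routine are simplified here):

```cpp
#include <cmath>
#include <cstdint>

float dequantize_one(int32_t q, float scale, int32_t zero_point) {
  return static_cast<float>(q - zero_point) * scale;
}

int32_t quantize_one(float x, float scale, int32_t zero_point) {
  // Inverse mapping; clamping to the target dtype's range is omitted in this sketch.
  return static_cast<int32_t>(std::nearbyint(x / scale)) + zero_point;
}
```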
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } std::memcpy( +<<<<<<< HEAD tmp_values, reinterpret_cast(ptr), count * sizeof(value_type)); +======= + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(tmp_values); } @@ -1166,16 +1712,33 @@ struct Vectorized : public VectorizedQuantizedConverter< }; template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.maximum(b); } template <> +<<<<<<< HEAD struct Vectorized : public VectorizedQuantizedConverter< c10::quint8, std::array, 4>, std::array, 4>, 4 * Vectorized::size()> { +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public VectorizedQuantizedConverter< + c10::quint8, + std::array, 4>, + std::array, 4>, + 4 * Vectorized::size()> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using VectorizedQuantizedConverter::VectorizedQuantizedConverter; static Vectorized loadu(const void* ptr) { @@ -1184,14 +1747,27 @@ struct Vectorized : public VectorizedQuantizedConverter< static Vectorized loadu(const void* ptr, int64_t count) { __at_align__ value_type tmp_values[size()]; +<<<<<<< HEAD // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. +======= + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
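relu and relu6 in the quantized structs above are clamps in the quantized domain: relu is a max against the quantized zero point, and relu6 additionally takes a min against the quantized representation of six (q_six). A minimal scalar sketch (assumption: standalone illustration):

```cpp
#include <algorithm>
#include <cstdint>

uint8_t qrelu(uint8_t v, uint8_t zero_point) {
  return std::max(v, zero_point);                   // maximum(zero_point)
}

uint8_t qrelu6(uint8_t v, uint8_t zero_point, uint8_t q_six) {
  return std::min(std::max(v, zero_point), q_six);  // min(max(v, zero_point), q_six)
}
```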
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } std::memcpy( +<<<<<<< HEAD tmp_values, reinterpret_cast(ptr), count * sizeof(value_type)); +======= + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(tmp_values); } @@ -1237,7 +1813,10 @@ struct Vectorized : public VectorizedQuantizedConverter< return maximum(zero_point); } +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized relu6( Vectorized zero_point, Vectorized q_six) { @@ -1283,13 +1862,20 @@ struct Vectorized : public VectorizedQuantizedConverter< }; template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.maximum(b); } #endif // if defined(CPU_CAPABILITY_AVX2) #if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256)) +<<<<<<< HEAD std::pair, Vectorized> inline convert_int8_to_float(at::vec::Vectorized src) { auto s8x8 = vld1_s8(src.operator const int8_t*()); @@ -1332,3 +1918,52 @@ inline convert_int8_half_register_to_float(at::vec::Vectorized src) { #endif }} // namespace at::vec::CPU_CAPABILITY +======= +std::pair, Vectorized> inline convert_int8_to_float( + at::vec::Vectorized src) { + auto s8x8 = vld1_s8(src.operator const int8_t*()); + auto s16x8 = vmovl_s8(s8x8); + + auto s32x4_hi = vmovl_s16(vget_high_s16(s16x8)); + auto s32x4_lo = vmovl_s16(vget_low_s16(s16x8)); + + return std::make_pair( + Vectorized(vcvtq_f32_s32(s32x4_lo)), + Vectorized(vcvtq_f32_s32(s32x4_hi))); +} + +std::pair, Vectorized> inline convert_int8_to_float( + at::vec::Vectorized src) { + auto u8x8 = vld1_u8(src.operator const uint8_t*()); + auto u16x8 = vmovl_u8(u8x8); + auto u32x4_hi = vmovl_u16(vget_high_u16(u16x8)); + auto u32x4_lo = vmovl_u16(vget_low_u16(u16x8)); + + return std::make_pair( + Vectorized(vcvtq_f32_u32(u32x4_lo)), + Vectorized(vcvtq_f32_u32(u32x4_hi))); +} + +Vectorized inline convert_int8_half_register_to_float( + at::vec::Vectorized src) { + auto s8x8 = vld1_s8(src.operator const int8_t*()); + auto s16x8 = vmovl_s8(s8x8); + + auto s32x4_lo = vmovl_s16(vget_low_s16(s16x8)); + + return Vectorized(vcvtq_f32_s32(s32x4_lo)); +} + +Vectorized inline convert_int8_half_register_to_float( + at::vec::Vectorized src) { + auto u8x8 = vld1_u8(src.operator const uint8_t*()); + auto u16x8 = vmovl_u8(u8x8); + auto u32x4_lo = vmovl_u16(vget_low_u16(u16x8)); + + return Vectorized(vcvtq_f32_u32(u32x4_lo)); +} + +#endif +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h index 2d8afd9ef295..97d4d1f83ca6 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h @@ -34,7 +34,13 @@ inline Vectorized convert_float_bfloat16( return Vectorized::loadu(arr2); } +<<<<<<< HEAD inline 
void load_fp32_from_bf16(const c10::BFloat16* data, Vectorized& out) { +======= +inline void load_fp32_from_bf16( + const c10::BFloat16* data, + Vectorized& out) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ float values[Vectorized::size()]; for (const auto k : c10::irange(Vectorized::size())) { values[k] = data[k]; @@ -68,6 +74,10 @@ inline void load_fp32_from_fp16( load_fp32_from_fp16(data, out2); } +<<<<<<< HEAD } // namespace +======= +} // namespace CPU_CAPABILITY +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_common_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_common_vsx.h index 98ac83963179..04bed3a64b66 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_common_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_common_vsx.h @@ -1,8 +1,13 @@ #pragma once #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Note: header order is important here #include @@ -14,8 +19,13 @@ #include #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include @@ -82,8 +92,12 @@ convert_to_int_of_same_size(const Vectorized& src) { template <> Vectorized C10_ALWAYS_INLINE +<<<<<<< HEAD convert_to_int_of_same_size( const Vectorized& src) { +======= +convert_to_int_of_same_size(const Vectorized& src) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized{vec_signed(src.vec0()), vec_signed(src.vec1())}; } @@ -91,10 +105,19 @@ template <> inline void convert(const int32_t* src, float* dst, int64_t n) { // int32_t and float have same size int64_t i; +<<<<<<< HEAD for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { const int32_t* src_a = src + i; float* dst_a = dst + i; vint32 input_vec0 = vec_vsx_ld(offset0, reinterpret_cast(src_a)); +======= + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + const int32_t* src_a = src + i; + float* dst_a = dst + i; + vint32 input_vec0 = + vec_vsx_ld(offset0, reinterpret_cast(src_a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) vint32 input_vec1 = vec_vsx_ld(offset16, reinterpret_cast(src_a)); vfloat32 c0 = vec_float(input_vec0); @@ -111,7 +134,12 @@ inline void convert(const int32_t* src, float* dst, int64_t n) { template <> inline void convert(const int64_t* src, double* dst, int64_t n) { int64_t i; +<<<<<<< HEAD for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { +======= + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const int64_t* src_a = src + i; double* dst_a = dst + i; vint64 input_vec0 = @@ -127,8 +155,13 @@ inline void convert(const int64_t* src, double* dst, int64_t n) { dst[i] = static_cast(src[i]); } } +<<<<<<< HEAD //Generic implementation to fix compiler error //TO-DO : Add optimized version for ppc64 +======= +// Generic 
implementation to fix compiler error +// TO-DO : Add optimized version for ppc64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline std::tuple, Vectorized> convert_half_float( const Vectorized& a) { constexpr int64_t K = Vectorized::size(); @@ -137,12 +170,22 @@ inline std::tuple, Vectorized> convert_half_float( a.store(arr2); convert(arr2, arr, K); return std::make_tuple( +<<<<<<< HEAD Vectorized::loadu(arr), Vectorized::loadu(arr + Vectorized::size())); } inline Vectorized convert_float_half( const Vectorized& a, const Vectorized& b) { +======= + Vectorized::loadu(arr), + Vectorized::loadu(arr + Vectorized::size())); +} + +inline Vectorized convert_float_half( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) constexpr int64_t K = Vectorized::size(); __at_align__ float arr[K]; __at_align__ Half arr2[K]; @@ -241,6 +284,10 @@ std::pair, Vectorized> inline deinterleave2( Vectorized{aa0123, aa0123_2}, Vectorized{bb0123, bb0123_2}); } +<<<<<<< HEAD } // namespace +======= +} // namespace CPU_CAPABILITY +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h index 2c74847758d8..5a90ffe2f411 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h @@ -1,7 +1,12 @@ #pragma once #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -12,6 +17,12 @@ inline namespace CPU_CAPABILITY { using ComplexDbl = c10::complex; template <> +<<<<<<< HEAD +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class Vectorized { union { struct { @@ -36,8 +47,15 @@ class Vectorized { Vectorized() {} C10_ALWAYS_INLINE Vectorized(vfloat64 v) : _vec0{v}, _vec1{v} {} C10_ALWAYS_INLINE Vectorized(vbool64 vmask) : _vecb0{vmask}, _vecb1{vmask} {} +<<<<<<< HEAD C10_ALWAYS_INLINE Vectorized(vfloat64 v1, vfloat64 v2) : _vec0{v1}, _vec1{v2} {} C10_ALWAYS_INLINE Vectorized(vbool64 v1, vbool64 v2) : _vecb0{v1}, _vecb1{v2} {} +======= + C10_ALWAYS_INLINE Vectorized(vfloat64 v1, vfloat64 v2) + : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vectorized(vbool64 v1, vbool64 v2) + : _vecb0{v1}, _vecb1{v2} {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(ComplexDbl val) { double real_value = val.real(); @@ -58,30 +76,62 @@ class Vectorized { } template +<<<<<<< HEAD static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std:: + enable_if_t> + C10_ALWAYS_INLINE blend( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a; } template +<<<<<<< HEAD static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const 
Vectorized& b) { +======= + static std:: + enable_if_t> + C10_ALWAYS_INLINE blend( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return b; } template +<<<<<<< HEAD static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std:: + enable_if_t> + C10_ALWAYS_INLINE blend( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return {b._vec0, a._vec1}; } template +<<<<<<< HEAD static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std:: + enable_if_t> + C10_ALWAYS_INLINE blend( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return {a._vec0, b._vec1}; } @@ -100,8 +150,13 @@ class Vectorized { const Vectorized& b, const Vectorized& mask) { // convert std::complex index mask to V index mask: xy -> xxyy +<<<<<<< HEAD auto mask_complex = Vectorized(vec_splat(mask._vec0, 0), vec_splat(mask._vec1, 0)); +======= + auto mask_complex = Vectorized( + vec_splat(mask._vec0, 0), vec_splat(mask._vec1, 0)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return { vec_sel(a._vec0, b._vec0, mask_complex._vecb0), vec_sel(a._vec1, b._vec1, mask_complex._vecb1)}; @@ -210,16 +265,26 @@ class Vectorized { } static Vectorized el_mergee( +<<<<<<< HEAD Vectorized& first, Vectorized& second) { +======= + const Vectorized& first, + const Vectorized& second) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return { vec_mergeh(first._vec0, second._vec0), vec_mergeh(first._vec1, second._vec1)}; } static Vectorized el_mergeo( +<<<<<<< HEAD Vectorized& first, Vectorized& second) { +======= + const Vectorized& first, + const Vectorized& second) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return { vec_mergel(first._vec0, second._vec0), vec_mergel(first._vec1, second._vec1)}; @@ -235,7 +300,13 @@ class Vectorized { Vectorized abs_() const { auto vi = el_mergeo(); auto vr = el_mergee(); +<<<<<<< HEAD return {Sleef_hypotd2_u05vsx(vr._vec0, vi._vec0), Sleef_hypotd2_u05vsx(vr._vec1, vi._vec1)}; +======= + return { + Sleef_hypotd2_u05vsx(vr._vec0, vi._vec0), + Sleef_hypotd2_u05vsx(vr._vec1, vi._vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized abs() const { @@ -394,8 +465,13 @@ class Vectorized { Vectorized& second) { // Operates on individual floats, see _mm_hadd_ps // {f0+f1, s0+s1, f2+f3, s2+s3, ...} +<<<<<<< HEAD // i.e. it sums the re and im of each value and interleaves first and second: // {f_re0 + f_im0, s_re0 + s_im0, f_re1 + f_im1, s_re1 + s_im1, ...} +======= + // i.e. 
it sums the re and im of each value and interleaves first and + // second: {f_re0 + f_im0, s_re0 + s_im0, f_re1 + f_im1, s_re1 + s_im1, ...} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return el_mergee(first, second) + el_mergeo(first, second); } @@ -413,7 +489,12 @@ class Vectorized { return el_mergee(first_ret, second_ret); // 2 mergee's } +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& b) const { +======= + Vectorized inline operator*( + const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) //(a + bi) * (c + di) = (ac - bd) + (ad + bc)i #if 1 // this is more vsx friendly than simulating horizontal from x86 @@ -422,7 +503,11 @@ class Vectorized { vi = vi ^ vd_rsign_mask; auto ret = elwise_mult(vr); auto vx_swapped = el_swapped(); +<<<<<<< HEAD ret = vx_swapped.el_madd(vi, ret); +======= + ret = vx_swapped.elwise_mult(vi) + ret; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #else auto ac_bd = elwise_mult(b); auto d_c = b.el_swapped(); @@ -433,6 +518,7 @@ class Vectorized { return ret; } +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& b) const { // re + im*i = (a + bi) / (c + di) // re = (ac + bd)/abs_2() @@ -452,6 +538,41 @@ class Vectorized { auto denom2 = b2.abs_2_(); // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 ret = ret.elwise_div(denom2); return ret; +======= + Vectorized inline operator/( + const Vectorized& b) const { + // re + im*i = (a + bi) / (c + di) + // re = (ac + bd)/abs_2() + // im = (bc - ad)/abs_2() + // auto fabs_cd = Vectorized{ + // vec_andc(b._vec0, vd_sign_mask), + // vec_andc(b._vec1, vd_sign_mask)}; // |c| |d| + // auto fabs_dc = fabs_cd.el_swapped(); // |d| |c| + // auto scale = fabs_cd.elwise_max(fabs_dc); // sc = max(|c|, |d|) + // auto a2 = elwise_div(scale); // a/sc b/sc + // auto b2 = b.elwise_div(scale); // c/sc d/sc + // auto acbd2 = a2.elwise_mult(b2); // ac/sc^2 bd/sc^2 + // auto dc2 = b2.el_swapped(); // d/sc c/sc + // dc2 = dc2 ^ vd_rsign_mask; // -d/sc c/sc + // auto adbc2 = a2.elwise_mult(dc2); // -ad/sc^2 bc/sc^2 + // auto ret = horizontal_add(acbd2, adbc2); // (ac+bd)/sc^2 (bc-ad)/sc^2 + // auto denom2 = b2.abs_2_(); // (c^2+d^2)/sc^2 + // (c^2+d^2)/sc^2 ret = ret.elwise_div(denom2); return ret; + + __at_align__ c10::complex + tmp1[Vectorized>::size()]; + __at_align__ c10::complex + tmp2[Vectorized>::size()]; + __at_align__ c10::complex + out[Vectorized>::size()]; + this->store(tmp1); + b.store(tmp2); + + for (const auto i : c10::irange(Vectorized>::size())) { + out[i] = tmp1[i] / tmp2[i]; + } + return loadu(out); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized exp() const { @@ -493,6 +614,7 @@ class Vectorized { } Vectorized eq(const Vectorized& other) const { +<<<<<<< HEAD auto eq = (*this == other); // compares real and imag individually // If both real numbers and imag numbers are equal, then the complex numbers are equal return (eq.real() & eq.imag()) & vd_one; @@ -500,6 +622,17 @@ class Vectorized { Vectorized ne(const Vectorized& other) const { auto ne = (*this != other); // compares real and imag individually // If either real numbers or imag numbers are not equal, then the complex numbers are not equal +======= + auto eq = (*this == other); // compares real and imag 
individually + // If both real numbers and imag numbers are equal, then the complex numbers + // are equal + return (eq.real() & eq.imag()) & vd_one; + } + Vectorized ne(const Vectorized& other) const { + auto ne = (*this != other); // compares real and imag individually + // If either real numbers or imag numbers are not equal, then the complex + // numbers are not equal +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (ne.real() | ne.imag()) & vd_one; } @@ -555,6 +688,7 @@ Vectorized inline minimum( } template <> +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { return Vectorized{vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; } @@ -580,5 +714,95 @@ Vectorized C10_ALWAYS_INLINE operator^(const Vectorized& } } // namespace +======= +Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + // (a + ib) * (c + id) = (ac - bd) + i(ad + bc) + // Split into real and imaginary parts + auto a_real = a.el_mergee(); // real part of a + auto a_imag = a.el_mergeo(); // imag part of a + auto b_real = b.el_mergee(); // real part of b + auto b_imag = b.el_mergeo(); // imag part of b + + // Compute components + auto ac = a_real.elwise_mult(b_real); // real*real + auto bd = a_imag.elwise_mult(b_imag); // imag*imag + + // Real part: ac - bd + auto real = ac - bd; + + auto ad = a_real.elwise_mult(b_imag); // real*imag + auto bc = a_imag.elwise_mult(b_real); // imag*real + + // Imag = ad + bc + auto imag = ad + bc; + + // Merge real and imaginary parts into vectors + __vector double v0 = vec_mergeh(real.vec0(), imag.vec0()); // [r0, i0] + __vector double v1 = vec_mergeh(real.vec1(), imag.vec1()); // [r1, i1] + + // Create the final result + auto result = Vectorized{v0, v1}; + return result; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const Vectorized& a, const Vectorized& b) { + // re + im*i = (a + bi) / (c + di) + // re = (ac + bd)/abs_2() + // im = (bc - ad)/abs_2() + // Take absolute values of real and imaginary parts of b + __at_align__ c10::complex + tmp1[Vectorized>::size()]; + __at_align__ c10::complex + tmp2[Vectorized>::size()]; + __at_align__ c10::complex + out[Vectorized>::size()]; + a.store(tmp1); + b.store(tmp2); + for (const auto i : c10::irange(Vectorized>::size())) { + out[i] = tmp1[i] / tmp2[i]; + } + return Vectorized::loadu(out); +} + +} // namespace CPU_CAPABILITY +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // 
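// Illustrative aside (not part of the patch): a minimal scalar sketch of the lane
// arithmetic the complex<double> operator* overload above relies on. Interleaved
// {re, im} storage is split into real/imaginary lanes (the role of el_mergee /
// el_mergeo), combined as (ac - bd) and (ad + bc), and interleaved back (the role
// of vec_mergeh). The array names and sizes here are assumptions for the sketch
// only, not the header's API.
#include <array>
#include <cassert>
#include <cmath>
#include <complex>

int main() {
  // Two packed complex values per vector, stored as re, im pairs.
  std::array<double, 4> a = {1.0, 2.0, -3.0, 0.5};   // (1+2i), (-3+0.5i)
  std::array<double, 4> b = {0.25, -1.0, 2.0, 2.0};  // (0.25-1i), (2+2i)
  std::array<double, 4> out{};
  for (std::size_t i = 0; i < a.size(); i += 2) {
    double ar = a[i], ai = a[i + 1];  // "even" / "odd" lanes of a
    double br = b[i], bi = b[i + 1];  // "even" / "odd" lanes of b
    out[i]     = ar * br - ai * bi;   // real lane: ac - bd
    out[i + 1] = ar * bi + ai * br;   // imag lane: ad + bc
  }
  // Cross-check the decomposition against std::complex multiplication.
  for (std::size_t i = 0; i < a.size(); i += 2) {
    std::complex<double> ref =
        std::complex<double>(a[i], a[i + 1]) * std::complex<double>(b[i], b[i + 1]);
    assert(std::abs(out[i] - ref.real()) < 1e-12);
    assert(std::abs(out[i + 1] - ref.imag()) < 1e-12);
  }
  return 0;
}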
namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h index 58fdd34b18d8..65582fcc1a2e 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h @@ -1,8 +1,13 @@ #pragma once #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -13,6 +18,12 @@ inline namespace CPU_CAPABILITY { using ComplexFlt = c10::complex; template <> +<<<<<<< HEAD +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class Vectorized { private: union { @@ -40,8 +51,15 @@ class Vectorized { C10_ALWAYS_INLINE Vectorized(vfloat32 v) : _vec0{v}, _vec1{v} {} C10_ALWAYS_INLINE Vectorized(vbool32 vmask) : _vecb0{vmask}, _vecb1{vmask} {} +<<<<<<< HEAD C10_ALWAYS_INLINE Vectorized(vfloat32 v1, vfloat32 v2) : _vec0{v1}, _vec1{v2} {} C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2) : _vecb0{v1}, _vecb1{v2} {} +======= + C10_ALWAYS_INLINE Vectorized(vfloat32 v1, vfloat32 v2) + : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2) + : _vecb0{v1}, _vecb1{v2} {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(ComplexFlt val) { float real_value = val.real(); @@ -50,7 +68,15 @@ class Vectorized { _vec1 = vfloat32{real_value, imag_value, real_value, imag_value}; } +<<<<<<< HEAD Vectorized(ComplexFlt val1, ComplexFlt val2, ComplexFlt val3, ComplexFlt val4) { +======= + Vectorized( + ComplexFlt val1, + ComplexFlt val2, + ComplexFlt val3, + ComplexFlt val4) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _vec0 = vfloat32{val1.real(), val1.imag(), val2.real(), val2.imag()}; _vec1 = vfloat32{val3.real(), val3.imag(), val4.real(), val4.imag()}; } @@ -153,8 +179,15 @@ class Vectorized { auto mask_complex = Vectorized( vec_mergeh(mask._vec0, mask._vec0), vec_mergeh(mask._vec1, mask._vec1)); return { +<<<<<<< HEAD vec_sel(a._vec0, b._vec0, reinterpret_cast(mask_complex._vec0)), vec_sel(a._vec1, b._vec1, reinterpret_cast(mask_complex._vec1)), +======= + vec_sel( + a._vec0, b._vec0, reinterpret_cast(mask_complex._vec0)), + vec_sel( + a._vec1, b._vec1, reinterpret_cast(mask_complex._vec1)), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; } @@ -250,8 +283,13 @@ class Vectorized { Vectorized& second) { // Operates on individual floats, see _mm_hadd_ps // {f0+f1, s0+s1, f2+f3, s2+s3, ...} +<<<<<<< HEAD // i.e. it sums the re and im of each value and interleaves first and second: // {f_re0 + f_im0, s_re0 + s_im0, f_re1 + f_im1, s_re1 + s_im1, ...} +======= + // i.e. 
it sums the re and im of each value and interleaves first and + // second: {f_re0 + f_im0, s_re0 + s_im0, f_re1 + f_im1, s_re1 + s_im1, ...} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return el_mergee(first, second) + el_mergeo(first, second); } @@ -279,7 +317,13 @@ class Vectorized { Vectorized abs_() const { auto vi = el_mergeo(); auto vr = el_mergee(); +<<<<<<< HEAD return {Sleef_hypotf4_u05vsx(vr._vec0, vi._vec0), Sleef_hypotf4_u05vsx(vr._vec1, vi._vec1)}; +======= + return { + Sleef_hypotf4_u05vsx(vr._vec0, vi._vec0), + Sleef_hypotf4_u05vsx(vr._vec1, vi._vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized abs() const { @@ -354,16 +398,26 @@ class Vectorized { } static Vectorized el_mergee( +<<<<<<< HEAD Vectorized& first, Vectorized& second) { +======= + const Vectorized& first, + const Vectorized& second) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return { vec_mergee(first._vecb0, second._vecb0), vec_mergee(first._vecb1, second._vecb1)}; } static Vectorized el_mergeo( +<<<<<<< HEAD Vectorized& first, Vectorized& second) { +======= + const Vectorized& first, + const Vectorized& second) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return { vec_mergeo(first._vecb0, second._vecb0), vec_mergeo(first._vecb1, second._vecb1)}; @@ -469,7 +523,12 @@ class Vectorized { return Vectorized(pi_2) - asin(); } +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& b) const { +======= + Vectorized inline operator*( + const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) //(a + bi) * (c + di) = (ac - bd) + (ad + bc)i #if 1 @@ -480,7 +539,11 @@ class Vectorized { vi = vi ^ rsign_mask; auto ret = elwise_mult(vr); auto vx_swapped = el_swapped(); +<<<<<<< HEAD ret = vx_swapped.el_madd(vi, ret); +======= + ret = vx_swapped.elwise_mult(vi) + ret; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; #else @@ -494,6 +557,7 @@ class Vectorized { #endif } +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& b) const { // re + im*i = (a + bi) / (c + di) // re = (ac + bd)/abs_2() @@ -513,6 +577,40 @@ class Vectorized { auto denom2 = b2.abs_2_(); // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 ret = ret.elwise_div(denom2); return ret; +======= + Vectorized inline operator/( + const Vectorized& b) const { +#if 1 + __at_align__ c10::complex + tmp1[Vectorized>::size()]; + __at_align__ c10::complex + tmp2[Vectorized>::size()]; + __at_align__ c10::complex + out[Vectorized>::size()]; + this->store(tmp1); + b.store(tmp2); + + for (const auto i : c10::irange(Vectorized>::size())) { + out[i] = tmp1[i] / tmp2[i]; + } + return loadu(out); +#else + auto fabs_cd = Vectorized{ + vec_andc(b._vec0, sign_mask), vec_andc(b._vec1, sign_mask)}; // |c| |d| + auto fabs_dc = fabs_cd.el_swapped(); // |d| |c| + auto scale = fabs_cd.elwise_max(fabs_dc); // sc = max(|c|, |d|) + auto a2 = elwise_div(scale); // a/sc b/sc + auto b2 = b.elwise_div(scale); // c/sc d/sc + auto acbd2 = a2.elwise_mult(b2); // ac/sc^2 bd/s + auto dc2 = b2.el_swapped(); // d/sc c/sc + dc2 = dc2 ^ rsign_mask; // -d/sc c/sc + auto adbc2 = 
a2.elwise_mult(dc2); // -ad/sc^2 bc/sc^2 + auto ret = horizontal_add(acbd2, adbc2); // (ac+bd)/sc^2 (bc-ad)/sc^2 + auto denom2 = b2.abs_2_(); // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 + ret = ret.elwise_div(denom2); + return ret; +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized asin() const { @@ -549,6 +647,7 @@ class Vectorized { } Vectorized eq(const Vectorized& other) const { +<<<<<<< HEAD auto eq = (*this == other); // compares real and imag individually // If both real numbers and imag numbers are equal, then the complex numbers are equal return (eq.real() & eq.imag()) & one; @@ -556,6 +655,17 @@ class Vectorized { Vectorized ne(const Vectorized& other) const { auto ne = (*this != other); // compares real and imag individually // If either real numbers or imag numbers are not equal, then the complex numbers are not equal +======= + auto eq = (*this == other); // compares real and imag individually + // If both real numbers and imag numbers are equal, then the complex numbers + // are equal + return (eq.real() & eq.imag()) & one; + } + Vectorized ne(const Vectorized& other) const { + auto ne = (*this != other); // compares real and imag individually + // If either real numbers or imag numbers are not equal, then the complex + // numbers are not equal +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (ne.real() | ne.imag()) & one; } @@ -631,6 +741,7 @@ Vectorized inline minimum( } template <> +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { return Vectorized{vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; } @@ -656,5 +767,114 @@ Vectorized C10_ALWAYS_INLINE operator^(const Vectorized& } } // namespace +======= +Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + // (a + ib) * (c + id) = (ac - bd) + i(ad + bc) + // Split into real and imaginary parts + auto a_real = a.el_mergee(); // real part of a + auto a_imag = a.el_mergeo(); // imag part of a + auto b_real = b.el_mergee(); // real part of b + auto b_imag = b.el_mergeo(); // imag part of b + + auto b_imag_neg = b_imag ^ rsign_mask; + // Compute components + auto ac = a_real.elwise_mult(b_real); // real * real + auto bd = a_imag.elwise_mult(b_imag_neg); // imag * imag + auto ad = a_real.elwise_mult(b_imag); // real * imag + auto bc = a_imag.elwise_mult(b_real); // imag * real + + // Real = ac - bd (fix the negative bd part) + auto real = ac + bd; // Real part 
calculation + auto imag = ad + bc; // Imaginary part calculation + + // Step 1: Extract from real and imag + __vector float r0 = real.vec0(); // {r0, r1, r2, r3} + __vector float i0 = imag.vec0(); // {i0, i1, i2, i3} + + __vector float r1 = real.vec1(); // imag[0..3] + __vector float i1 = imag.vec1(); // imag[4..7] + + __vector unsigned char perm_lo = { + 0, + 1, + 2, + 3, // r0 + 16, + 17, + 18, + 19, // + 8, + 9, + 10, + 11, // r1 + 24, + 25, + 26, + 27}; + __vector float v0 = + vec_perm(r0, i0, perm_lo); // Interleave r0 and i0, r1 and i1 + __vector float v1 = vec_perm(r1, i1, perm_lo); + Vectorized result(v0, v1); + return result; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const Vectorized& a, const Vectorized& b) { + // Take absolute values of real and imaginary parts of b + __at_align__ c10::complex + tmp1[Vectorized>::size()]; + __at_align__ c10::complex + tmp2[Vectorized>::size()]; + __at_align__ c10::complex out[Vectorized>::size()]; + a.store(tmp1); + b.store(tmp2); + for (const auto i : + c10::irange(Vectorized>:: + size())) { //{Vectorized>::size())) + //{ + out[i] = tmp1[i] / tmp2[i]; + } + return Vectorized::loadu(out); +} + +} // namespace CPU_CAPABILITY +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h index ff10618611f9..7fdeb12c2f19 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h @@ -1,8 +1,13 @@ #pragma once #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -12,6 +17,11 @@ namespace vec { inline namespace CPU_CAPABILITY { +<<<<<<< HEAD +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template <> class Vectorized { @@ -39,8 +49,15 @@ class Vectorized { Vectorized() {} C10_ALWAYS_INLINE Vectorized(vfloat64 v) : _vec0{v}, _vec1{v} {} C10_ALWAYS_INLINE Vectorized(vbool64 vmask) : _vecb0{vmask}, _vecb1{vmask} {} +<<<<<<< HEAD C10_ALWAYS_INLINE Vectorized(vfloat64 v1, vfloat64 v2) : _vec0{v1}, _vec1{v2} {} C10_ALWAYS_INLINE Vectorized(vbool64 v1, vbool64 v2) : _vecb0{v1}, _vecb1{v2} {} +======= + C10_ALWAYS_INLINE Vectorized(vfloat64 v1, vfloat64 v2) + : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vectorized(vbool64 v1, vbool64 v2) + : _vecb0{v1}, _vecb1{v2} {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) C10_ALWAYS_INLINE Vectorized(double scalar) : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {} C10_ALWAYS_INLINE Vectorized( @@ -63,6 +80,7 @@ class Vectorized { } template +<<<<<<< HEAD static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { return a; @@ -120,10 +138,72 @@ class Vectorized { // generated masks return { b._vec0, (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd) }; +======= + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + return a; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + 
return b; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + return {b._vec0, a._vec1}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + return {a._vec0, b._vec1}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + const vbool64 mask_1st = VsxDblMask1(mask); + return {(vfloat64)vec_sel(a._vec0, b._vec0, mask_1st), a._vec1}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + const vbool64 mask_1st = VsxDblMask1(mask); + return {(vfloat64)vec_sel(a._vec0, b._vec0, mask_1st), b._vec1}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + const vbool64 mask_2nd = VsxDblMask2(mask); + // generated masks + return {a._vec0, (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd)}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + const vbool64 mask_2nd = VsxDblMask2(mask); + // generated masks + return {b._vec0, (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template static std::enable_if_t> +<<<<<<< HEAD C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { const vbool64 mask_1st = VsxDblMask1(mask); const vbool64 mask_2nd = VsxDblMask2(mask); @@ -133,6 +213,17 @@ class Vectorized { } +======= + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + const vbool64 mask_1st = VsxDblMask1(mask); + const vbool64 mask_2nd = VsxDblMask2(mask); + return { + (vfloat64)vec_sel(a._vec0, b._vec0, mask_1st), + (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd)}; + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static Vectorized C10_ALWAYS_INLINE blendv( const Vectorized& a, const Vectorized& b, @@ -144,12 +235,26 @@ class Vectorized { vec_sel(a._vec1, b._vec1, mask._vecb1)}; } template +<<<<<<< HEAD static Vectorized arange(double base = 0., step_t step = static_cast(1)) { return Vectorized(base, base + step, base + 2 * step, base + 3 * step); } static Vectorized C10_ALWAYS_INLINE set(const Vectorized& a, const Vectorized& b, size_t count = size()) { +======= + static Vectorized arange( + double base = 0., + step_t step = static_cast(1)) { + return Vectorized( + base, base + step, base + 2 * step, base + 3 * step); + } + + static Vectorized C10_ALWAYS_INLINE + set(const Vectorized& a, + const Vectorized& b, + size_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -192,15 +297,24 @@ class Vectorized { double& operator[](int idx) = delete; Vectorized map(double (*const f)(double)) const { Vectorized ret; +<<<<<<< HEAD for (const auto i : c10::irange(size()/2)) { ret._vec0[i] = f(_vec0[i]); } for (const auto i : c10::irange(size()/2)) { ret._vec1[i] = f(_vec1[i]); +======= + for (const auto i : c10::irange(size() / 2)) { + ret._vec0[i] = f(_vec0[i]); + } + for (const auto i : c10::irange(size() / 2)) { + ret._vec1[i] = f(_vec1[i]); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 
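// Illustrative sketch only: a scalar stand-in for the map()/mapbi() helpers whose
// reformatting appears in this hunk — a scalar callback applied to every lane of
// both underlying 128-bit halves. The struct and function names below are
// assumptions; the real class is Vectorized<double> with members _vec0/_vec1.
#include <array>
#include <cmath>
#include <cstddef>
#include <iostream>

static double my_sqrt(double x) { return std::sqrt(x); }  // hypothetical scalar callback

struct VecF64x4 {            // stand-in for Vectorized<double>
  std::array<double, 2> v0;  // plays the role of _vec0
  std::array<double, 2> v1;  // plays the role of _vec1

  VecF64x4 map(double (*f)(double)) const {
    VecF64x4 r{};
    for (std::size_t i = 0; i < v0.size(); ++i) r.v0[i] = f(v0[i]);  // first half
    for (std::size_t i = 0; i < v1.size(); ++i) r.v1[i] = f(v1[i]);  // second half
    return r;
  }
};

int main() {
  VecF64x4 x{{1.0, 4.0}, {9.0, 16.0}};
  VecF64x4 y = x.map(my_sqrt);
  std::cout << y.v0[0] << ' ' << y.v0[1] << ' '
            << y.v1[0] << ' ' << y.v1[1] << '\n';  // prints: 1 2 3 4
}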
} return ret; } +<<<<<<< HEAD Vectorized mapbi(double (*const f)(double, double), const Vectorized& other) const { Vectorized ret; @@ -209,6 +323,17 @@ class Vectorized { } for (const auto i : c10::irange(size()/2)) { ret._vec1[i] = f(_vec1[i], other._vec1[i]); +======= + Vectorized mapbi( + double (*const f)(double, double), + const Vectorized& other) const { + Vectorized ret; + for (const auto i : c10::irange(size() / 2)) { + ret._vec0[i] = f(_vec0[i], other._vec0[i]); + } + for (const auto i : c10::irange(size() / 2)) { + ret._vec1[i] = f(_vec1[i], other._vec1[i]); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } return ret; } @@ -217,6 +342,7 @@ class Vectorized { } Vectorized C10_ALWAYS_INLINE acos() const { +<<<<<<< HEAD return {Sleef_acosd2_u10(_vec0), Sleef_acosd2_u10(_vec1)}; } Vectorized C10_ALWAYS_INLINE acosh() const { @@ -248,11 +374,48 @@ class Vectorized { } Vectorized C10_ALWAYS_INLINE exp() const { return {Sleef_expd2_u10(_vec0), Sleef_expd2_u10(_vec1)}; +======= + return {Sleef_acosd2_u10(_vec0), Sleef_acosd2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE acosh() const { + return {Sleef_acoshd2_u10(_vec0), Sleef_acoshd2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE asin() const { + return {Sleef_asind2_u10(_vec0), Sleef_asind2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE asinh() const { + return {Sleef_asinhd2_u10(_vec0), Sleef_asinhd2_u10(_vec1)}; + } + Vectorized atan() const { + return {Sleef_atand2_u10(_vec0), Sleef_atand2_u10(_vec1)}; + } + Vectorized atanh() const { + return {Sleef_atanhd2_u10(_vec0), Sleef_atanhd2_u10(_vec1)}; + } + Vectorized atan2(const Vectorized& b) const { + return { + Sleef_atan2d2_u10(_vec0, b._vec0), Sleef_atan2d2_u10(_vec1, b._vec1)}; + } + Vectorized copysign(const Vectorized& sign) const { + return { + Sleef_copysignd2(_vec0, sign._vec0), + Sleef_copysignd2(_vec1, sign._vec1)}; + } + Vectorized erf() const { + return {Sleef_erfd2_u10(_vec0), Sleef_erfd2_u10(_vec1)}; + } + Vectorized erfc() const { + return {Sleef_erfcd2_u15(_vec0), Sleef_erfcd2_u15(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE exp() const { + return {Sleef_expd2_u10(_vec0), Sleef_expd2_u10(_vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized C10_ALWAYS_INLINE exp2() const { return {Sleef_exp2d2_u10(_vec0), Sleef_exp2d2_u10(_vec1)}; } Vectorized expm1() const { +<<<<<<< HEAD return {Sleef_expm1d2_u10(_vec0), Sleef_expm1d2_u10(_vec1)}; } Vectorized C10_ALWAYS_INLINE exp_u20() const { @@ -261,6 +424,16 @@ class Vectorized { Vectorized lgamma() const __ubsan_ignore_undefined__ { return {Sleef_lgammad2_u10(_vec0), Sleef_lgammad2_u10(_vec1)}; +======= + return {Sleef_expm1d2_u10(_vec0), Sleef_expm1d2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE exp_u20() const { + return exp(); + } + + Vectorized lgamma() const __ubsan_ignore_undefined__ { + return {Sleef_lgammad2_u10(_vec0), Sleef_lgammad2_u10(_vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized erfinv() const { @@ -269,7 +442,13 @@ class Vectorized { Vectorized angle() const { auto tmp = blendv( +<<<<<<< HEAD Vectorized(0), Vectorized(c10::pi), *this < Vectorized(0)); +======= + Vectorized(0), + Vectorized(c10::pi), + *this < Vectorized(0)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) return blendv(tmp, *this, isnan()); } Vectorized real() const { @@ -283,6 +462,7 @@ class Vectorized { } Vectorized C10_ALWAYS_INLINE log() const { +<<<<<<< HEAD return {Sleef_logd2_u10(_vec0), Sleef_logd2_u10(_vec1)}; } Vectorized C10_ALWAYS_INLINE log10() const { @@ -293,15 +473,34 @@ class Vectorized { } Vectorized C10_ALWAYS_INLINE log2() const { return {Sleef_log2d2_u10(_vec0), Sleef_log2d2_u10(_vec1)}; +======= + return {Sleef_logd2_u10(_vec0), Sleef_logd2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE log10() const { + return {Sleef_log10d2_u10(_vec0), Sleef_log10d2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE log1p() const { + return {Sleef_log1pd2_u10(_vec0), Sleef_log1pd2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE log2() const { + return {Sleef_log2d2_u10(_vec0), Sleef_log2d2_u10(_vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized C10_ALWAYS_INLINE ceil() const { return {vec_ceil(_vec0), vec_ceil(_vec1)}; } Vectorized C10_ALWAYS_INLINE cos() const { +<<<<<<< HEAD return {Sleef_cosd2_u10(_vec0), Sleef_cosd2_u10(_vec1)}; } Vectorized C10_ALWAYS_INLINE cosh() const { return {Sleef_coshd2_u10(_vec0), Sleef_coshd2_u10(_vec1)}; +======= + return {Sleef_cosd2_u10(_vec0), Sleef_cosd2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE cosh() const { + return {Sleef_coshd2_u10(_vec0), Sleef_coshd2_u10(_vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized C10_ALWAYS_INLINE floor() const { return {vec_floor(_vec0), vec_floor(_vec1)}; @@ -313,6 +512,7 @@ class Vectorized { return {vec_rint(_vec0), vec_rint(_vec1)}; } Vectorized C10_ALWAYS_INLINE sin() const { +<<<<<<< HEAD return {Sleef_sind2_u10(_vec0), Sleef_sind2_u10(_vec1)}; } Vectorized C10_ALWAYS_INLINE sinh() const { @@ -323,6 +523,18 @@ class Vectorized { } Vectorized C10_ALWAYS_INLINE tanh() const { return {Sleef_tanhd2_u10(_vec0), Sleef_tanhd2_u10(_vec1)}; +======= + return {Sleef_sind2_u10(_vec0), Sleef_sind2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE sinh() const { + return {Sleef_sinhd2_u10(_vec0), Sleef_sinhd2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE tan() const { + return {Sleef_tand2_u10(_vec0), Sleef_tand2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE tanh() const { + return {Sleef_tanhd2_u10(_vec0), Sleef_tanhd2_u10(_vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized C10_ALWAYS_INLINE trunc() const { return {vec_trunc(_vec0), vec_trunc(_vec1)}; @@ -345,6 +557,7 @@ class Vectorized { } Vectorized C10_ALWAYS_INLINE pow(const Vectorized& b) const { +<<<<<<< HEAD return {Sleef_powd2_u10(_vec0, b._vec0), Sleef_powd2_u10(_vec1, b._vec1)}; } Vectorized C10_ALWAYS_INLINE fmod(const Vectorized& b) const { @@ -357,6 +570,22 @@ class Vectorized { Vectorized nextafter(const Vectorized& b) const { return {Sleef_nextafterd2(_vec0, b._vec0), Sleef_nextafterd2(_vec1, b._vec1)}; +======= + return {Sleef_powd2_u10(_vec0, b._vec0), Sleef_powd2_u10(_vec1, b._vec1)}; + } + Vectorized C10_ALWAYS_INLINE fmod(const Vectorized& b) const { + return {Sleef_fmodd2(_vec0, b._vec0), Sleef_fmodd2(_vec1, b._vec1)}; + } + + Vectorized hypot(const Vectorized& b) const { + return { + Sleef_hypotd2_u05(_vec0, b._vec0), Sleef_hypotd2_u05(_vec1, b._vec1)}; + } + + Vectorized nextafter(const Vectorized& b) const { + return { + 
Sleef_nextafterd2(_vec0, b._vec0), Sleef_nextafterd2(_vec1, b._vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized igamma(const Vectorized& x) const { @@ -367,7 +596,10 @@ class Vectorized { return mapbi(calc_igammac, x); } +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized i0() const { return map(calc_i0); } @@ -390,6 +622,7 @@ class Vectorized { return ret._nor(); } bool has_inf_nan() const { +<<<<<<< HEAD for (const auto i : c10::irange(size()/2)) { if(_isnan(_vec0[i]) || _isinf(_vec0[i])) { return true; @@ -397,6 +630,15 @@ class Vectorized { } for (const auto i : c10::irange(size()/2)) { if(_isnan(_vec1[i]) || _isinf(_vec1[i])) { +======= + for (const auto i : c10::irange(size() / 2)) { + if (_isnan(_vec0[i]) || _isinf(_vec0[i])) { + return true; + } + } + for (const auto i : c10::irange(size() / 2)) { + if (_isnan(_vec1[i]) || _isinf(_vec1[i])) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return true; } } @@ -441,6 +683,7 @@ Vectorized inline minimum( } template <> +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { return Vectorized{vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; } @@ -476,5 +719,56 @@ Vectorized C10_ALWAYS_INLINE operator^(const Vectorized& a, cons } } // namespace +======= +Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_div(a.vec0(), b.vec0()), vec_div(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} + +} // namespace CPU_CAPABILITY +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h index 246f0e8a7f1e..04556891c5d7 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h @@ -1,8 +1,13 @@ #pragma once #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include 
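// Illustrative sketch only: a portable scalar model of the has_inf_nan() check
// reformatted above — scan every lane of both halves and report whether any
// element is NaN or infinite. The helper name and the plain std:: functions are
// assumptions; the header iterates _vec0/_vec1 with _isnan/_isinf over
// c10::irange(size()/2).
#include <array>
#include <cmath>
#include <iostream>
#include <limits>

static bool has_inf_nan(const std::array<double, 2>& v0,
                        const std::array<double, 2>& v1) {
  for (double x : v0)
    if (std::isnan(x) || std::isinf(x)) return true;  // first half
  for (double x : v1)
    if (std::isnan(x) || std::isinf(x)) return true;  // second half
  return false;
}

int main() {
  std::array<double, 2> a = {1.0, 2.0};
  std::array<double, 2> b = {3.0, std::numeric_limits<double>::infinity()};
  std::cout << std::boolalpha
            << has_inf_nan(a, a) << ' '   // false
            << has_inf_nan(a, b) << '\n'; // true
}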
namespace at { namespace vec { @@ -11,6 +16,12 @@ namespace vec { inline namespace CPU_CAPABILITY { template <> +<<<<<<< HEAD +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class Vectorized { private: union { @@ -38,8 +49,15 @@ class Vectorized { C10_ALWAYS_INLINE Vectorized(vfloat32 v) : _vec0{v}, _vec1{v} {} C10_ALWAYS_INLINE Vectorized(vbool32 vmask) : _vecb0{vmask}, _vecb1{vmask} {} +<<<<<<< HEAD C10_ALWAYS_INLINE Vectorized(vfloat32 v1, vfloat32 v2) : _vec0{v1}, _vec1{v2} {} C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2) : _vecb0{v1}, _vecb1{v2} {} +======= + C10_ALWAYS_INLINE Vectorized(vfloat32 v1, vfloat32 v2) + : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2) + : _vecb0{v1}, _vecb1{v2} {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) C10_ALWAYS_INLINE Vectorized(float scalar) : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {} C10_ALWAYS_INLINE Vectorized( @@ -61,62 +79,116 @@ class Vectorized { } template +<<<<<<< HEAD static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a; } template +<<<<<<< HEAD static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return b; } template +<<<<<<< HEAD static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return {b._vec0, a._vec1}; } template +<<<<<<< HEAD static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return {a._vec0, b._vec1}; } template +<<<<<<< HEAD static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const vbool32 mask_1st = VsxMask1(mask); return {(vfloat32)vec_sel(a._vec0, b._vec0, mask_1st), a._vec1}; } template +<<<<<<< HEAD static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const vbool32 mask_1st = VsxMask1(mask); return 
{(vfloat32)vec_sel(a._vec0, b._vec0, mask_1st), b._vec1}; } template +<<<<<<< HEAD static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const vbool32 mask_2nd = VsxMask2(mask); // generated masks return {a._vec0, (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)}; } template +<<<<<<< HEAD static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const vbool32 mask_2nd = VsxMask2(mask); // generated masks return {b._vec0, (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)}; } template +<<<<<<< HEAD static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const vbool32 mask_1st = VsxMask1(mask); const vbool32 mask_2nd = VsxMask2(mask); return { @@ -136,7 +208,13 @@ class Vectorized { } template +<<<<<<< HEAD static Vectorized arange(float base = 0.f, step_t step = static_cast(1)) { +======= + static Vectorized arange( + float base = 0.f, + step_t step = static_cast(1)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized( base, base + step, @@ -212,8 +290,14 @@ class Vectorized { return ret; } +<<<<<<< HEAD Vectorized mapbi(float (*const f)(float, float), const Vectorized& other) const { +======= + Vectorized mapbi( + float (*const f)(float, float), + const Vectorized& other) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized ret; for (int i = 0; i < size() / 2; i++) { ret._vec0[i] = f(_vec0[i], other._vec0[i]); @@ -235,6 +319,7 @@ class Vectorized { } bool has_inf_nan() const { +<<<<<<< HEAD for (const auto i : c10::irange(size()/2)) { if(_isnan(_vec0[i]) || _isinf(_vec0[i])) { return true; @@ -242,6 +327,15 @@ class Vectorized { } for (const auto i : c10::irange(size()/2)) { if(_isnan(_vec1[i]) || _isinf(_vec1[i])) { +======= + for (const auto i : c10::irange(size() / 2)) { + if (_isnan(_vec0[i]) || _isinf(_vec0[i])) { + return true; + } + } + for (const auto i : c10::irange(size() / 2)) { + if (_isnan(_vec1[i]) || _isinf(_vec1[i])) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return true; } } @@ -268,7 +362,11 @@ class Vectorized { return {Sleef_acosf4_u10(_vec0), Sleef_acosf4_u10(_vec1)}; } Vectorized C10_ALWAYS_INLINE acosh() const { +<<<<<<< HEAD return {Sleef_acoshf4_u10(_vec0), Sleef_acoshf4_u10(_vec1)}; +======= + return {Sleef_acoshf4_u10(_vec0), Sleef_acoshf4_u10(_vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized C10_ALWAYS_INLINE asin() const { return {Sleef_asinf4_u10(_vec0), Sleef_asinf4_u10(_vec1)}; @@ -283,10 +381,20 @@ 
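// Illustrative sketch only: the blend<mask> specializations in this file pick, for
// each lane i, element i of b when bit i of the compile-time mask is set and
// element i of a otherwise; the VSX code expands those bits into all-ones lane
// masks (VsxMask1/VsxMask2) and hands them to vec_sel. blend_model below is a
// hypothetical scalar helper written for this sketch, not part of the header.
#include <array>
#include <cstddef>
#include <cstdint>
#include <iostream>

template <uint32_t Mask, typename T, std::size_t N>
std::array<T, N> blend_model(const std::array<T, N>& a, const std::array<T, N>& b) {
  std::array<T, N> out{};
  for (std::size_t i = 0; i < N; ++i) {
    out[i] = ((Mask >> i) & 1u) ? b[i] : a[i];  // mirrors vec_sel with a per-lane mask
  }
  return out;
}

int main() {
  std::array<int32_t, 8> a = {0, 1, 2, 3, 4, 5, 6, 7};
  std::array<int32_t, 8> b = {10, 11, 12, 13, 14, 15, 16, 17};
  auto r = blend_model<0b00001111>(a, b);  // low 4 lanes from b, high 4 from a
  for (auto v : r) std::cout << v << ' ';
  std::cout << '\n';  // prints: 10 11 12 13 4 5 6 7
}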
class Vectorized { return {Sleef_atanhf4_u10(_vec0), Sleef_atanhf4_u10(_vec1)}; } Vectorized atan2(const Vectorized& b) const { +<<<<<<< HEAD return {Sleef_atan2f4_u10(_vec0, b._vec0), Sleef_atan2f4_u10(_vec1, b._vec1)}; } Vectorized copysign(const Vectorized &sign) const { return {Sleef_copysignf4(_vec0, sign._vec0), Sleef_copysignf4(_vec1, sign._vec1)}; +======= + return { + Sleef_atan2f4_u10(_vec0, b._vec0), Sleef_atan2f4_u10(_vec1, b._vec1)}; + } + Vectorized copysign(const Vectorized& sign) const { + return { + Sleef_copysignf4(_vec0, sign._vec0), + Sleef_copysignf4(_vec1, sign._vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized lgamma() const { return {Sleef_lgammaf4_u10(_vec0), Sleef_lgammaf4_u10(_vec1)}; @@ -305,7 +413,13 @@ class Vectorized { Vectorized angle() const { auto tmp = blendv( +<<<<<<< HEAD Vectorized(0), Vectorized(c10::pi), *this < Vectorized(0)); +======= + Vectorized(0), + Vectorized(c10::pi), + *this < Vectorized(0)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return blendv(tmp, *this, isnan()); } Vectorized real() const { @@ -393,6 +507,7 @@ class Vectorized { } Vectorized C10_ALWAYS_INLINE pow(const Vectorized& exp) const { +<<<<<<< HEAD return {Sleef_powf4_u10(_vec0, exp._vec0), Sleef_powf4_u10(_vec1, exp._vec1)}; } @@ -406,6 +521,24 @@ class Vectorized { Vectorized nextafter(const Vectorized& b) const { return {Sleef_nextafterf4(_vec0, b._vec0), Sleef_nextafterf4(_vec1, b._vec1)}; +======= + return { + Sleef_powf4_u10(_vec0, exp._vec0), Sleef_powf4_u10(_vec1, exp._vec1)}; + } + + Vectorized fmod(const Vectorized& b) const { + return {Sleef_fmodf4(_vec0, b._vec0), Sleef_fmodf4(_vec1, b._vec1)}; + } + + Vectorized hypot(const Vectorized& b) const { + return { + Sleef_hypotf4_u05(_vec0, b._vec0), Sleef_hypotf4_u05(_vec1, b._vec1)}; + } + + Vectorized nextafter(const Vectorized& b) const { + return { + Sleef_nextafterf4(_vec0, b._vec0), Sleef_nextafterf4(_vec1, b._vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized igamma(const Vectorized& x) const { @@ -453,16 +586,29 @@ class Vectorized { }; template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.maximum(b); } template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.minimum(b); } template <> +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { return Vectorized{vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; } @@ -498,5 +644,56 @@ Vectorized C10_ALWAYS_INLINE operator^(const Vectorized& a, const } } // namespace +======= +Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const 
Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_div(a.vec0(), b.vec0()), vec_div(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} + +} // namespace CPU_CAPABILITY +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h index ae146dae4d42..006053d6fed3 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h @@ -1,14 +1,25 @@ #pragma once #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at { namespace vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { template <> +<<<<<<< HEAD +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class Vectorized { private: union { @@ -35,7 +46,12 @@ class Vectorized { C10_ALWAYS_INLINE Vectorized(vint16 v) : _vec0{v}, _vec1{v} {} C10_ALWAYS_INLINE Vectorized(vbool16 vmask) : _vecb0{vmask}, _vecb1{vmask} {} C10_ALWAYS_INLINE Vectorized(vint16 v1, vint16 v2) : _vec0{v1}, _vec1{v2} {} +<<<<<<< HEAD C10_ALWAYS_INLINE Vectorized(vbool16 v1, vbool16 v2) : _vecb0{v1}, _vecb1{v2} {} +======= + C10_ALWAYS_INLINE Vectorized(vbool16 v1, vbool16 v2) + : _vecb0{v1}, _vecb1{v2} {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) C10_ALWAYS_INLINE Vectorized(int16_t scalar) : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {} @@ -89,7 +105,12 @@ class Vectorized { template static std::enable_if_t<(mask & 65535) == 65535, Vectorized> +<<<<<<< HEAD C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return b; } @@ -101,7 +122,12 @@ class Vectorized { template static std::enable_if_t<(mask > 0 && mask < 255), Vectorized> +<<<<<<< HEAD C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes 
with float/bfloat16/half (#2791)) constexpr int16_t g0 = (mask & 1) * 0xffff; constexpr int16_t g1 = ((mask & 2) >> 1) * 0xffff; constexpr int16_t g2 = ((mask & 4) >> 2) * 0xffff; @@ -119,7 +145,12 @@ class Vectorized { static std::enable_if_t< (mask > 255 && (mask & 65535) != 65535 && ((mask & 255) == 255)), Vectorized> +<<<<<<< HEAD C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) constexpr int16_t g0_2 = (mask & 1) * 0xffff; constexpr int16_t g1_2 = ((mask & 2) >> 1) * 0xffff; constexpr int16_t g2_2 = ((mask & 4) >> 2) * 0xffff; @@ -139,7 +170,12 @@ class Vectorized { static std::enable_if_t< (mask > 255 && ((mask & 65535) != 65535) && ((mask & 255) == 0)), Vectorized> +<<<<<<< HEAD C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) constexpr int16_t mask2 = (mask & 65535) >> 16; constexpr int16_t g0_2 = (mask & 1) * 0xffff; constexpr int16_t g1_2 = ((mask & 2) >> 1) * 0xffff; @@ -161,7 +197,12 @@ class Vectorized { (mask > 255 && ((mask & 65535) != 65535) && ((mask & 255) != 0) && ((mask & 255) != 255)), Vectorized> +<<<<<<< HEAD C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) constexpr int16_t g0 = (mask & 1) * 0xffff; constexpr int16_t g1 = ((mask & 2) >> 1) * 0xffff; constexpr int16_t g2 = ((mask & 4) >> 2) * 0xffff; @@ -202,7 +243,13 @@ class Vectorized { } template +<<<<<<< HEAD static Vectorized arange(int16_t base = 0, step_t step = static_cast(1)) { +======= + static Vectorized arange( + int16_t base = 0, + step_t step = static_cast(1)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized( base, base + step, @@ -282,7 +329,12 @@ class Vectorized { __at_align__ value_type tmp_values[size()]; vec_vsx_st(_vec0, offset0, tmp_values); vec_vsx_st(_vec1, offset16, tmp_values); +<<<<<<< HEAD std::memcpy(ptr, tmp_values, std::min(count, size()) * sizeof(value_type)); +======= + std::memcpy( + ptr, tmp_values, std::min(count, size()) * sizeof(value_type)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } const int16_t& operator[](int idx) const = delete; @@ -290,7 +342,13 @@ class Vectorized { Vectorized angle() const { return blendv( +<<<<<<< HEAD Vectorized(0), Vectorized(c10::pi), *this < Vectorized(0)); +======= + Vectorized(0), + Vectorized(c10::pi), + *this < Vectorized(0)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized real() const { return *this; @@ -335,6 +393,7 @@ class Vectorized { }; template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { vuint16 shift_vec0 = reinterpret_cast(b.vec0()); vuint16 shift_vec1 = reinterpret_cast(b.vec1()); @@ -346,6 +405,25 @@ Vectorized inline operator>>(const Vectorized& 
a, const Vector vuint16 shift_vec0 = reinterpret_cast(b.vec0()); vuint16 shift_vec1 = reinterpret_cast(b.vec1()) ; return Vectorized{vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)}; +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + vuint16 shift_vec0 = reinterpret_cast(b.vec0()); + vuint16 shift_vec1 = reinterpret_cast(b.vec1()); + return Vectorized{ + vec_sl(a.vec0(), shift_vec0), vec_sl(a.vec1(), shift_vec1)}; +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + vuint16 shift_vec0 = reinterpret_cast(b.vec0()); + vuint16 shift_vec1 = reinterpret_cast(b.vec1()); + return Vectorized{ + vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> @@ -363,6 +441,7 @@ Vectorized inline minimum( } template <> +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { return Vectorized{vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; } @@ -398,5 +477,55 @@ Vectorized C10_ALWAYS_INLINE operator^(const Vectorized& a, co } } // namespace +======= +Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const Vectorized& a, const Vectorized& b) { + return Vectorized{a.vec0() / b.vec0(), a.vec1() / b.vec1()}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} + +} // namespace CPU_CAPABILITY +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h index 98401381c6e8..6ec23580acac 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h @@ -1,14 +1,25 @@ #pragma once #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at { namespace vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { template <> +<<<<<<< HEAD +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class Vectorized { 
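// Illustrative sketch only: a portable scalar model of the per-lane operator<< /
// operator>> overloads reformatted nearby. The shift counts are reinterpreted as
// unsigned lanes and each element is shifted by the matching element of b; per
// AltiVec/VSX semantics, vec_sl/vec_sr take the count modulo the element width
// and vec_sr is a logical (unsigned) right shift, which the casts below model.
// Names and sample values are assumptions for the sketch.
#include <array>
#include <cstddef>
#include <cstdint>
#include <iostream>

int main() {
  std::array<int32_t, 4> a = {1, 8, 256, 7};
  std::array<int32_t, 4> b = {3, 1, 4, 0};
  std::array<int32_t, 4> left{}, right{};
  for (std::size_t i = 0; i < a.size(); ++i) {
    uint32_t amt = static_cast<uint32_t>(b[i]) & 31u;  // count modulo element width
    left[i]  = static_cast<int32_t>(static_cast<uint32_t>(a[i]) << amt);  // like vec_sl
    right[i] = static_cast<int32_t>(static_cast<uint32_t>(a[i]) >> amt);  // like vec_sr (logical)
  }
  for (std::size_t i = 0; i < a.size(); ++i)
    std::cout << left[i] << '/' << right[i] << ' ';
  std::cout << '\n';  // prints: 8/0 16/4 4096/16 7/7
}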
private: union { @@ -35,7 +46,12 @@ class Vectorized { C10_ALWAYS_INLINE Vectorized(vint32 v) : _vec0{v}, _vec1{v} {} C10_ALWAYS_INLINE Vectorized(vbool32 vmask) : _vecb0{vmask}, _vecb1{vmask} {} C10_ALWAYS_INLINE Vectorized(vint32 v1, vint32 v2) : _vec0{v1}, _vec1{v2} {} +<<<<<<< HEAD C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2) : _vecb0{v1}, _vecb1{v2} {} +======= + C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2) + : _vecb0{v1}, _vecb1{v2} {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) C10_ALWAYS_INLINE Vectorized(int32_t scalar) : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {} C10_ALWAYS_INLINE Vectorized( @@ -63,8 +79,14 @@ class Vectorized { } template +<<<<<<< HEAD static std::enable_if_t<(mask & 255) == 255, Vectorized> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std::enable_if_t<(mask & 255) == 255, Vectorized> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return b; } @@ -76,7 +98,12 @@ class Vectorized { template static std::enable_if_t<(mask > 0 && mask < 15), Vectorized> +<<<<<<< HEAD C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) constexpr uint32_t g0 = (mask & 1) * 0xffffffff; constexpr uint32_t g1 = ((mask & 2) >> 1) * 0xffffffff; constexpr uint32_t g2 = ((mask & 4) >> 2) * 0xffffffff; @@ -90,7 +117,12 @@ class Vectorized { static std::enable_if_t< (mask > 15 && (mask & 255) != 255 && ((mask & 15) == 15)), Vectorized> +<<<<<<< HEAD C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) constexpr uint32_t mask2 = (mask & 255) >> 4; constexpr uint32_t g0_2 = (mask2 & 1) * 0xffffffff; constexpr uint32_t g1_2 = ((mask2 & 2) >> 1) * 0xffffffff; @@ -106,7 +138,12 @@ class Vectorized { static std::enable_if_t< (mask > 15 && ((mask & 255) != 255) && ((mask & 15) == 0)), Vectorized> +<<<<<<< HEAD C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) constexpr uint32_t mask2 = (mask & 255) >> 4; constexpr uint32_t g0_2 = (mask2 & 1) * 0xffffffff; constexpr uint32_t g1_2 = ((mask2 & 2) >> 1) * 0xffffffff; @@ -123,7 +160,12 @@ class Vectorized { (mask > 15 && ((mask & 255) != 255) && ((mask & 15) != 0) && ((mask & 15) != 15)), Vectorized> +<<<<<<< HEAD C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) constexpr uint32_t g0 = (mask & 1) * 0xffffffff; constexpr uint32_t g1 = ((mask & 2) >> 1) * 0xffffffff; constexpr uint32_t g2 = ((mask & 4) >> 2) * 0xffffffff; @@ -155,7 +197,13 @@ class Vectorized { } template 
+<<<<<<< HEAD static Vectorized arange(int32_t base = 0.f, step_t step = static_cast(1)) { +======= + static Vectorized arange( + int32_t base = 0.f, + step_t step = static_cast(1)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized( base, base + step, @@ -221,7 +269,13 @@ class Vectorized { Vectorized angle() const { return blendv( +<<<<<<< HEAD Vectorized(0), Vectorized(c10::pi), *this < Vectorized(0)); +======= + Vectorized(0), + Vectorized(c10::pi), + *this < Vectorized(0)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized real() const { return *this; @@ -266,6 +320,7 @@ class Vectorized { }; template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { vuint32 shift_vec0 = reinterpret_cast(b.vec0()); vuint32 shift_vec1 = reinterpret_cast(b.vec1()) ; @@ -277,6 +332,25 @@ Vectorized inline operator>>(const Vectorized& a, const Vector vuint32 shift_vec0 = reinterpret_cast(b.vec0()); vuint32 shift_vec1 = reinterpret_cast(b.vec1()) ; return Vectorized{vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)}; +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + vuint32 shift_vec0 = reinterpret_cast(b.vec0()); + vuint32 shift_vec1 = reinterpret_cast(b.vec1()); + return Vectorized{ + vec_sl(a.vec0(), shift_vec0), vec_sl(a.vec1(), shift_vec1)}; +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + vuint32 shift_vec0 = reinterpret_cast(b.vec0()); + vuint32 shift_vec1 = reinterpret_cast(b.vec1()); + return Vectorized{ + vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> @@ -294,6 +368,7 @@ Vectorized inline minimum( } template <> +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { return Vectorized{vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; } @@ -329,5 +404,55 @@ Vectorized C10_ALWAYS_INLINE operator^(const Vectorized& a, co } } // namespace +======= +Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const Vectorized& a, const Vectorized& b) { + return Vectorized{a.vec0() / b.vec0(), a.vec1() / b.vec1()}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} 
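// Illustrative sketch only: a minimal model of the layout all of these VSX
// specializations share — a 256-bit Vectorized<T> is stored as two 128-bit halves
// (_vec0/_vec1), and each element-wise free operator simply applies the matching
// 128-bit operation to both halves, as in the operator+/-/*/&/|/^ overloads above.
// The struct and member names below are assumptions for the sketch, not the
// header's API.
#include <array>
#include <cstddef>
#include <cstdint>
#include <iostream>

struct VecI32x8 {
  std::array<int32_t, 4> lo;  // plays the role of _vec0
  std::array<int32_t, 4> hi;  // plays the role of _vec1
};

static VecI32x8 operator+(const VecI32x8& a, const VecI32x8& b) {
  VecI32x8 r{};
  for (std::size_t i = 0; i < 4; ++i) {
    r.lo[i] = a.lo[i] + b.lo[i];  // models vec_add on the first half
    r.hi[i] = a.hi[i] + b.hi[i];  // models vec_add on the second half
  }
  return r;
}

int main() {
  VecI32x8 a{{0, 1, 2, 3}, {4, 5, 6, 7}};
  VecI32x8 b{{10, 10, 10, 10}, {20, 20, 20, 20}};
  VecI32x8 c = a + b;
  for (auto v : c.lo) std::cout << v << ' ';
  for (auto v : c.hi) std::cout << v << ' ';
  std::cout << '\n';  // prints: 10 11 12 13 24 25 26 27
}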
+ +} // namespace CPU_CAPABILITY +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h index f8217930fa49..fa164b13672c 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h @@ -1,14 +1,25 @@ #pragma once #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at { namespace vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { template <> +<<<<<<< HEAD +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class Vectorized { private: union { @@ -36,7 +47,12 @@ class Vectorized { C10_ALWAYS_INLINE Vectorized(vint64 v) : _vec0{v}, _vec1{v} {} C10_ALWAYS_INLINE Vectorized(vbool64 vmask) : _vecb0{vmask}, _vecb1{vmask} {} C10_ALWAYS_INLINE Vectorized(vint64 v1, vint64 v2) : _vec0{v1}, _vec1{v2} {} +<<<<<<< HEAD C10_ALWAYS_INLINE Vectorized(vbool64 v1, vbool64 v2) : _vecb0{v1}, _vecb1{v2} {} +======= + C10_ALWAYS_INLINE Vectorized(vbool64 v1, vbool64 v2) + : _vecb0{v1}, _vecb1{v2} {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) C10_ALWAYS_INLINE Vectorized(int64_t scalar) : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {} C10_ALWAYS_INLINE Vectorized( @@ -66,14 +82,26 @@ class Vectorized { } template +<<<<<<< HEAD static std::enable_if_t<(mask & 15) == 15, Vectorized> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std::enable_if_t<(mask & 15) == 15, Vectorized> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return b; } template +<<<<<<< HEAD static std::enable_if_t<(mask > 0 && mask < 3), Vectorized> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std::enable_if_t<(mask > 0 && mask < 3), Vectorized> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) constexpr uint64_t g0 = (mask & 1) * 0xffffffffffffffff; constexpr uint64_t g1 = ((mask & 2) >> 1) * 0xffffffffffffffff; const vbool64 mask_1st = (vbool64){g0, g1}; @@ -82,7 +110,12 @@ class Vectorized { template static std::enable_if_t<(mask > 3) && (mask & 3) == 0, Vectorized> +<<<<<<< HEAD C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) constexpr uint64_t g0_2 = ((mask & 4) >> 2) * 0xffffffffffffffff; constexpr uint64_t g1_2 = ((mask & 8) >> 3) * 0xffffffffffffffff; @@ -94,7 +127,12 @@ class Vectorized { static std::enable_if_t< (mask > 3) && (mask & 3) != 0 && (mask & 15) != 15, Vectorized> +<<<<<<< HEAD C10_ALWAYS_INLINE blend(const Vectorized& a, 
const Vectorized& b) { +======= + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) constexpr uint64_t g0 = (mask & 1) * 0xffffffffffffffff; constexpr uint64_t g1 = ((mask & 2) >> 1) * 0xffffffffffffffff; constexpr uint64_t g0_2 = ((mask & 4) >> 2) * 0xffffffffffffffff; @@ -118,8 +156,16 @@ class Vectorized { vec_sel(a._vec1, b._vec1, mask._vecb1)}; } template +<<<<<<< HEAD static Vectorized arange(int64_t base = 0., step_t step = static_cast(1)) { return Vectorized(base, base + step, base + 2 * step, base + 3 * step); +======= + static Vectorized arange( + int64_t base = 0., + step_t step = static_cast(1)) { + return Vectorized( + base, base + step, base + 2 * step, base + 3 * step); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } static Vectorized C10_ALWAYS_INLINE @@ -174,7 +220,13 @@ class Vectorized { Vectorized angle() const { return blendv( +<<<<<<< HEAD Vectorized(0), Vectorized(c10::pi), *this < Vectorized(0)); +======= + Vectorized(0), + Vectorized(c10::pi), + *this < Vectorized(0)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized real() const { return *this; @@ -219,6 +271,7 @@ class Vectorized { }; template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { vuint64 shift_vec0 = reinterpret_cast(b.vec0()); vuint64 shift_vec1 = reinterpret_cast(b.vec1()) ; @@ -230,6 +283,25 @@ Vectorized inline operator>>(const Vectorized& a, const Vector vuint64 shift_vec0 = reinterpret_cast(b.vec0()); vuint64 shift_vec1 = reinterpret_cast(b.vec1()) ; return Vectorized{vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)}; +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + vuint64 shift_vec0 = reinterpret_cast(b.vec0()); + vuint64 shift_vec1 = reinterpret_cast(b.vec1()); + return Vectorized{ + vec_sl(a.vec0(), shift_vec0), vec_sl(a.vec1(), shift_vec1)}; +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + vuint64 shift_vec0 = reinterpret_cast(b.vec0()); + vuint64 shift_vec1 = reinterpret_cast(b.vec1()); + return Vectorized{ + vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> @@ -247,6 +319,7 @@ Vectorized inline minimum( } template <> +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { return Vectorized{vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; } @@ -282,5 +355,56 @@ Vectorized C10_ALWAYS_INLINE operator^(const Vectorized& a, co } } // namespace +======= +Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const 
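// Hedged sketch, illustrative only: how the compile-time blend<mask>
// overloads above turn a 4-bit mask into per-lane selectors. Each low bit of
// the mask is multiplied into an all-ones 64-bit lane mask (g0, g1, g0_2,
// g1_2 in the diff), which vec_sel then uses to pick lane i from b when the
// bit is set and from a otherwise. lane_selector is a stand-in name.
#include <cstdint>

constexpr uint64_t lane_selector(uint32_t mask, int bit) {
  return ((mask >> bit) & 1u) ? 0xffffffffffffffffULL : 0x0ULL;
}

static_assert(lane_selector(0b0001, 0) == 0xffffffffffffffffULL, "lane 0 from b");
static_assert(lane_selector(0b0001, 1) == 0x0ULL, "lane 1 from a");
// mask == 15 selects every lane from b, which is why that case returns b directly.
static_assert(lane_selector(0b1111, 3) == 0xffffffffffffffffULL, "lane 3 from b");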
Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_div(a.vec0(), b.vec0()), vec_div(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} + +} // namespace CPU_CAPABILITY +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h index 8068d6102f4a..c74483abdef6 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h @@ -1,8 +1,13 @@ #pragma once #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -24,14 +29,24 @@ // specified by float_vec_return_type. // // When writing kernels with these vectors, it is expected that floating- +<<<<<<< HEAD // point operations will be carried out in a loop over Vectorized::float_num_vecs // iterations. +======= +// point operations will be carried out in a loop over +// Vectorized::float_num_vecs iterations. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at { namespace vec { inline namespace CPU_CAPABILITY { template <> +<<<<<<< HEAD +======= +struct is_vec_specialized_for : std::bool_constant {}; +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) struct Vectorized { private: union { @@ -68,7 +83,12 @@ struct Vectorized { C10_ALWAYS_INLINE Vectorized(vint32 v) : _vec0{v}, _vec1{v} {} C10_ALWAYS_INLINE Vectorized(vbool32 vmask) : _vecb0{vmask}, _vecb1{vmask} {} C10_ALWAYS_INLINE Vectorized(vint32 v1, vint32 v2) : _vec0{v1}, _vec1{v2} {} +<<<<<<< HEAD C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2) : _vecb0{v1}, _vecb1{v2} {} +======= + C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2) + : _vecb0{v1}, _vecb1{v2} {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(const c10::qint32& val) : _vec0(vec_splats(val.val_)), _vec1(vec_splats(val.val_)) {} @@ -114,11 +134,23 @@ struct Vectorized { vfloat32 float_vals1 = vec_float(_vec1); vfloat32 scale_vec0 = scale.vec0(); vfloat32 scale_vec1 = scale.vec1(); +<<<<<<< HEAD vfloat32 scale_zp_premul0 = scale_zp_premul.vec0(); vfloat32 scale_zp_premul1 = scale_zp_premul.vec1(); return {Vectorized{ vec_madd(scale_vec0, float_vals0, scale_zp_premul0), vec_madd(scale_vec1, float_vals1, scale_zp_premul1)}}; +======= + vfloat32 zero_point_vec0 = zero_point.vec0(); + vfloat32 zero_point_vec1 = zero_point.vec1(); + + vfloat32 vec_sub_zero_point_0 = vec_sub(float_vals0, zero_point_vec0); + vfloat32 vec_sub_zero_point_1 = vec_sub(float_vals1, zero_point_vec1); + Vectorized vf0 = 
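// Hedged sketch, not from the patch itself: the qint32 dequantize path
// changes from a fused multiply-add against a precomputed
// scale_zp_premul = -scale * zero_point term to an explicit
// subtract-then-multiply. Both compute the same affine dequantization;
// only the rounding of intermediates can differ, since fma rounds once.
#include <cassert>
#include <cmath>

float dequant_fma(float q, float scale, float scale_zp_premul) {
  return std::fma(scale, q, scale_zp_premul);  // old form
}

float dequant_sub_mul(float q, float scale, float zero_point) {
  return (q - zero_point) * scale;  // new form in the diff
}

int main() {
  const float scale = 0.05f, zero_point = 10.0f, q = 37.0f;
  const float a = dequant_fma(q, scale, -scale * zero_point);
  const float b = dequant_sub_mul(q, scale, zero_point);
  // Tolerance rather than bit-equality: fma rounds once, sub/mul twice.
  assert(std::fabs(a - b) < 1e-5f);
  return 0;
}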
{ + vec_mul(scale_vec0, vec_sub_zero_point_0), + vec_mul(scale_vec1, vec_sub_zero_point_1)}; + return {vf0}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } float_vec_return_type dequantize( @@ -154,8 +186,13 @@ struct Vectorized { vecf1 = vec_mul(vecf1, inverse_scale_v); vecf0 = vec_add(vec_rint(vecf0), vec_zero_point); vecf1 = vec_add(vec_rint(vecf1), vec_zero_point); +<<<<<<< HEAD vint32 veci0 = vec_signed(vecf0); vint32 veci1 = vec_signed(vecf1); +======= + vint32 veci0 = vec_signed(vecf0); + vint32 veci1 = vec_signed(vecf1); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) veci0 = vec_max(veci0, vmin); veci1 = vec_max(veci1, vmin); @@ -199,8 +236,13 @@ struct Vectorized { vecf0 = vec_rint(vecf0); vecf1 = vec_rint(vecf1); +<<<<<<< HEAD vint32 veci0 = vec_add(vec_signed(vecf0),vec_zero_point); vint32 veci1 = vec_add(vec_signed(vecf1),vec_zero_point); +======= + vint32 veci0 = vec_add(vec_signed(vecf0), vec_zero_point); + vint32 veci1 = vec_add(vec_signed(vecf1), vec_zero_point); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) veci0 = vec_max(veci0, vmin); veci1 = vec_max(veci1, vmin); @@ -242,6 +284,7 @@ Vectorized inline minimum( } template <> +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { return Vectorized{vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; } @@ -277,5 +320,55 @@ Vectorized C10_ALWAYS_INLINE operator^(const Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const Vectorized& a, const Vectorized& b) { + return Vectorized{a.vec0() / b.vec0(), a.vec1() / b.vec1()}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} + +} // namespace CPU_CAPABILITY +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h index f67d42a4cb51..03087e55802d 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h @@ -1,8 +1,13 @@ #pragma once #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel 
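// Hedged sketch of the quantize direction shown above, written per lane.
// The VSX sequence multiplies by the inverse scale, rounds to nearest
// (vec_rint), adds the zero point, and clamps to the representable range
// (vec_max with vmin, vec_min with vmax) before packing. quantize_ref is an
// illustrative name; the diff has two variants that add the zero point either
// as float or as int, which agree for in-range values.
#include <algorithm>
#include <cmath>
#include <cstdint>

int32_t quantize_ref(float x, float inverse_scale, int32_t zero_point,
                     int32_t qmin, int32_t qmax) {
  const float scaled = x * inverse_scale;
  const int32_t rounded =
      static_cast<int32_t>(std::nearbyint(scaled)) + zero_point;
  return std::min(std::max(rounded, qmin), qmax);
}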
for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -24,14 +29,24 @@ // specified by float_vec_return_type. // // When writing kernels with these vectors, it is expected that floating- +<<<<<<< HEAD // point operations will be carried out in a loop over Vectorized::float_num_vecs // iterations. +======= +// point operations will be carried out in a loop over +// Vectorized::float_num_vecs iterations. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at { namespace vec { inline namespace CPU_CAPABILITY { template <> +<<<<<<< HEAD +======= +struct is_vec_specialized_for : std::bool_constant {}; +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) struct Vectorized { private: union { @@ -144,6 +159,7 @@ struct Vectorized { vfloat32 vecf1_3 = vec_float(veci7); vfloat32 scale_vec0 = scale.vec0(); vfloat32 scale_vec1 = scale.vec1(); +<<<<<<< HEAD vfloat32 scale_zp_premul0 = scale_zp_premul.vec0(); vfloat32 scale_zp_premul1 = scale_zp_premul.vec1(); return { @@ -164,6 +180,41 @@ struct Vectorized { float_vec_return_type C10_ALWAYS_INLINE dequantize( Vectorized scale, Vectorized zero_point) const { +======= + + vfloat32 zero_point_vec0 = zero_point.vec0(); + vfloat32 zero_point_vec1 = zero_point.vec1(); + + vfloat32 vec_substract_src_zp0_0 = vec_sub(vecf0_0, zero_point_vec0); + vfloat32 vec_substract_src_zp1_0 = vec_sub(vecf1_0, zero_point_vec1); + Vectorized vf0_zp = { + vec_mul(scale_vec0, vec_substract_src_zp0_0), + vec_mul(scale_vec1, vec_substract_src_zp1_0)}; + + vfloat32 vec_substract_src_zp0_1 = vec_sub(vecf0_1, zero_point_vec0); + vfloat32 vec_substract_src_zp1_1 = vec_sub(vecf1_1, zero_point_vec1); + Vectorized vf1_zp = { + vec_mul(scale_vec0, vec_substract_src_zp0_1), + vec_mul(scale_vec1, vec_substract_src_zp1_1)}; + + vfloat32 vec_substract_src_zp0_2 = vec_sub(vecf0_2, zero_point_vec0); + vfloat32 vec_substract_src_zp1_2 = vec_sub(vecf1_2, zero_point_vec1); + Vectorized vf2_zp = { + vec_mul(scale_vec0, vec_substract_src_zp0_2), + vec_mul(scale_vec1, vec_substract_src_zp1_2)}; + + vfloat32 vec_substract_src_zp0_3 = vec_sub(vecf0_3, zero_point_vec0); + vfloat32 vec_substract_src_zp1_3 = vec_sub(vecf1_3, zero_point_vec1); + Vectorized vf3_zp = { + vec_mul(scale_vec0, vec_substract_src_zp0_3), + vec_mul(scale_vec1, vec_substract_src_zp1_3)}; + + return {vf0_zp, vf1_zp, vf2_zp, vf3_zp}; + } + + float_vec_return_type C10_ALWAYS_INLINE + dequantize(Vectorized scale, Vectorized zero_point) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) vint16 vecshi0 = vec_unpackh(_vec0); vint16 vecshi1 = vec_unpackl(_vec0); @@ -290,7 +341,12 @@ struct Vectorized { return {vec0, vec1}; } +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE relu(Vectorized zero_point) const { +======= + Vectorized C10_ALWAYS_INLINE + relu(Vectorized zero_point) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return {vec_max(_vec0, zero_point._vec0), vec_max(_vec1, zero_point._vec1)}; } @@ -444,6 +500,7 @@ Vectorized inline minimum( } template <> +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { return Vectorized{vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; } @@ -479,5 +536,55 @@ Vectorized C10_ALWAYS_INLINE 
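// Hedged sketch, simplified: for the 8-bit quantized types the diff widens a
// 256-bit vector of 32 int8 lanes into four Vectorized<float> groups (via
// unpack high/low to int16, then int32, then vec_float) and applies
// (v - zero_point) * scale to each group. The straight group*8+lane ordering
// below is a simplification; the real unpack sequence interleaves lanes, but
// the per-lane arithmetic is the same.
#include <array>
#include <cstdint>

std::array<std::array<float, 8>, 4> dequantize_i8_ref(
    const std::array<int8_t, 32>& q, float scale, float zero_point) {
  std::array<std::array<float, 8>, 4> out{};
  for (int group = 0; group < 4; ++group) {
    for (int lane = 0; lane < 8; ++lane) {
      const float widened = static_cast<float>(q[group * 8 + lane]);
      out[group][lane] = (widened - zero_point) * scale;
    }
  }
  return out;
}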
operator^(const Vectorized& } } // namespace +======= +Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const Vectorized& a, const Vectorized& b) { + return Vectorized{a.vec0() / b.vec0(), a.vec1() / b.vec1()}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} + +} // namespace CPU_CAPABILITY +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h index c0d77d500491..aa1089551bf5 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h @@ -1,8 +1,13 @@ #pragma once #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -26,13 +31,24 @@ // specified by float_vec_return_type. // // When writing kernels with these vectors, it is expected that floating- +<<<<<<< HEAD // point operations will be carried out in a loop over Vectorized::float_num_vecs // iterations. +======= +// point operations will be carried out in a loop over +// Vectorized::float_num_vecs iterations. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at { namespace vec { inline namespace CPU_CAPABILITY { +<<<<<<< HEAD +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const vint16 mask_unsigned = vec_splats((short int)0xFF); template <> struct Vectorized { @@ -155,6 +171,7 @@ struct Vectorized { vfloat32 vecf1_3 = vec_float(veci7); vfloat32 scale_vec0 = scale.vec0(); vfloat32 scale_vec1 = scale.vec1(); +<<<<<<< HEAD vfloat32 scale_zp_premul0 = scale_zp_premul.vec0(); vfloat32 scale_zp_premul1 = scale_zp_premul.vec1(); return { @@ -175,6 +192,41 @@ struct Vectorized { float_vec_return_type C10_ALWAYS_INLINE dequantize( Vectorized scale, Vectorized zero_point) const { +======= + + vfloat32 zero_point_vec0 = zero_point.vec0(); + vfloat32 zero_point_vec1 = zero_point.vec1(); + + vfloat32 vec_substract_src_zp0_0 = vec_sub(vecf0_0, zero_point_vec0); + vfloat32 vec_substract_src_zp1_0 = vec_sub(vecf1_0, zero_point_vec1); + Vectorized vf0_zp = { + vec_mul(scale_vec0, vec_substract_src_zp0_0), + vec_mul(scale_vec1, vec_substract_src_zp1_0)}; + + vfloat32 vec_substract_src_zp0_1 = vec_sub(vecf0_1, zero_point_vec0); + vfloat32 vec_substract_src_zp1_1 = vec_sub(vecf1_1, zero_point_vec1); + Vectorized vf1_zp = { + vec_mul(scale_vec0, vec_substract_src_zp0_1), + vec_mul(scale_vec1, vec_substract_src_zp1_1)}; + + vfloat32 vec_substract_src_zp0_2 = vec_sub(vecf0_2, zero_point_vec0); + vfloat32 vec_substract_src_zp1_2 = vec_sub(vecf1_2, zero_point_vec1); + Vectorized vf2_zp = { + vec_mul(scale_vec0, vec_substract_src_zp0_2), + vec_mul(scale_vec1, vec_substract_src_zp1_2)}; + + vfloat32 vec_substract_src_zp0_3 = vec_sub(vecf0_3, zero_point_vec0); + vfloat32 vec_substract_src_zp1_3 = vec_sub(vecf1_3, zero_point_vec1); + Vectorized vf3_zp = { + vec_mul(scale_vec0, vec_substract_src_zp0_3), + vec_mul(scale_vec1, vec_substract_src_zp1_3)}; + + return {vf0_zp, vf1_zp, vf2_zp, vf3_zp}; + } + + float_vec_return_type C10_ALWAYS_INLINE + dequantize(Vectorized scale, Vectorized zero_point) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // unpacking unsigned as signed vint16 vecshi0 = vec_unpackh((vint8)_vec0); vint16 vecshi1 = vec_unpackl((vint8)_vec0); @@ -214,6 +266,10 @@ struct Vectorized { vfloat32 vecf1_3 = vec_float(veci7); vfloat32 scale_vec0 = scale.vec0(); vfloat32 scale_vec1 = scale.vec1(); +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) vfloat32 zero_point0 = zero_point.vec0(); vfloat32 zero_point1 = zero_point.vec1(); return { @@ -298,12 +354,23 @@ struct Vectorized { return {vec0, vec1}; } +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE relu(Vectorized zero_point) const { return {vec_max(_vec0, zero_point._vec0), vec_max(_vec1, zero_point._vec1)}; } Vectorized C10_ALWAYS_INLINE relu6(Vectorized zero_point, Vectorized q_six) const { +======= + Vectorized C10_ALWAYS_INLINE + relu(Vectorized zero_point) const { + return {vec_max(_vec0, zero_point._vec0), vec_max(_vec1, zero_point._vec1)}; + } + + Vectorized C10_ALWAYS_INLINE relu6( + Vectorized zero_point, + Vectorized q_six) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
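// Hedged sketch of the trait pattern the incoming side adds next to each
// Vectorized specialization: is_vec_specialized_for lets generic code detect
// at compile time whether an element type has a hand-written SIMD
// implementation. The primary template and quint8_tag below are simplified
// stand-ins, not the actual c10 definitions.
#include <type_traits>

template <typename T, typename Enable = void>
struct is_vec_specialized_for : std::bool_constant<false> {};

struct quint8_tag {};  // stand-in for c10::quint8 in this sketch

template <>
struct is_vec_specialized_for<quint8_tag> : std::bool_constant<true> {};

static_assert(is_vec_specialized_for<quint8_tag>::value,
              "has a specialized Vectorized");
static_assert(!is_vec_specialized_for<long double>::value,
              "falls back to the generic path");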
float/bfloat16/half (#2791)) vuint8 max0 = vec_max(_vec0, zero_point._vec0); vuint8 max1 = vec_max(_vec1, zero_point._vec1); return {vec_min(max0, q_six._vec0), vec_min(max1, q_six._vec1)}; @@ -462,6 +529,7 @@ Vectorized inline minimum( } template <> +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { return Vectorized{vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; } @@ -497,5 +565,55 @@ Vectorized C10_ALWAYS_INLINE operator^(const Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const Vectorized& a, const Vectorized& b) { + return Vectorized{a.vec0() / b.vec0(), a.vec1() / b.vec1()}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} + +} // namespace CPU_CAPABILITY +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h b/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h index 1dc742f3cbb1..d8139e086c43 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h @@ -1,13 +1,20 @@ #pragma once +<<<<<<< HEAD #include #include #include +======= +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if defined(__clang__) typedef __vector __bool char vbool8; typedef __vector __bool short vbool16; typedef __vector __bool int vbool32; typedef __vector __bool long long vbool64; +<<<<<<< HEAD using vint8 = __attribute__((vector_size(16))) signed char; using vint16 = __attribute__((vector_size(16))) signed short; using vint32 = __attribute__((vector_size(16))) signed int; @@ -33,6 +40,37 @@ using vuint32 = __attribute__((altivec(vector__))) unsigned int; using vuint64 = __attribute__((altivec(vector__))) unsigned long long; using vfloat32 = __attribute__((altivec(vector__))) float; using vfloat64 = __attribute__((altivec(vector__))) double; +======= +using vint8 = __attribute__((vector_size(16))) signed char; +using vint16 = __attribute__((vector_size(16))) signed short; +using vint32 = __attribute__((vector_size(16))) signed int; +using vint64 = __attribute__((vector_size(16))) signed long long; +using vuint8 = __attribute__((vector_size(16))) unsigned char; +using vuint16 = __attribute__((vector_size(16))) unsigned short; +using vuint32 = __attribute__((vector_size(16))) unsigned int; +using 
vuint64 = __attribute__((vector_size(16))) unsigned long long; +using vfloat32 = __attribute__((vector_size(16))) float; +using vfloat64 = __attribute__((vector_size(16))) double; +#else +using vbool8 = + __attribute__((altivec(vector__))) __attribute__((altivec(bool__))) char; +using vbool16 = + __attribute__((altivec(vector__))) __attribute__((altivec(bool__))) short; +using vbool32 = + __attribute__((altivec(vector__))) __attribute__((altivec(bool__))) int; +using vbool64 = __attribute__((altivec(vector__))) +__attribute__((altivec(bool__))) long long; +using vint8 = __attribute__((altivec(vector__))) signed char; +using vint16 = __attribute__((altivec(vector__))) signed short; +using vint32 = __attribute__((altivec(vector__))) signed int; +using vint64 = __attribute__((altivec(vector__))) signed long long; +using vuint8 = __attribute__((altivec(vector__))) unsigned char; +using vuint16 = __attribute__((altivec(vector__))) unsigned short; +using vuint32 = __attribute__((altivec(vector__))) unsigned int; +using vuint64 = __attribute__((altivec(vector__))) unsigned long long; +using vfloat32 = __attribute__((altivec(vector__))) float; +using vfloat64 = __attribute__((altivec(vector__))) double; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif #if !defined(vec_float) @@ -71,7 +109,11 @@ C10_ALWAYS_INLINE vfloat64 vec_neg(const vfloat64& vec_in) { } C10_ALWAYS_INLINE vint16 vec_neg(const vint16& vec_in) { +<<<<<<< HEAD vint16 vint0 = {0, 0, 0, 0 ,0, 0, 0, 0}; +======= + vint16 vint0 = {0, 0, 0, 0, 0, 0, 0, 0}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return vec_vsubuhm(vint0, vec_in); } @@ -116,6 +158,7 @@ C10_ALWAYS_INLINE T vec_max_nan(const T& a, const T& b) { } // Specializations for float/double taken from Eigen +<<<<<<< HEAD template<> C10_ALWAYS_INLINE vfloat32 vec_min_nan(const vfloat32& a, const vfloat32& b) { @@ -148,6 +191,52 @@ C10_ALWAYS_INLINE vfloat64 vec_max_nan(const vfloat64& a, const vfloat // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN vfloat64 ret; __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); +======= +template <> +C10_ALWAYS_INLINE vfloat32 +vec_min_nan(const vfloat32& a, const vfloat32& b) { + // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE + // regarding NaN + vfloat32 ret; + __asm__("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" + : "=&wa"(ret) + : "wa"(a), "wa"(b)); + return ret; +} +// Specializations for float/double taken from Eigen +template <> +C10_ALWAYS_INLINE vfloat32 +vec_max_nan(const vfloat32& a, const vfloat32& b) { + // NOTE: about 10% slower than vec_max, but consistent with std::min and SSE + // regarding NaN + vfloat32 ret; + __asm__("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" + : "=&wa"(ret) + : "wa"(a), "wa"(b)); + return ret; +} + +template <> +C10_ALWAYS_INLINE vfloat64 +vec_min_nan(const vfloat64& a, const vfloat64& b) { + // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE + // regarding NaN + vfloat64 ret; + __asm__("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" + : "=&wa"(ret) + : "wa"(a), "wa"(b)); + return ret; +} +template <> +C10_ALWAYS_INLINE vfloat64 +vec_max_nan(const vfloat64& a, const vfloat64& b) { + // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE + // regarding NaN + vfloat64 
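// Hedged sketch: scalar equivalents of the Eigen-derived vec_min_nan /
// vec_max_nan specializations above. The inline asm builds a compare mask
// (xvcmpgesp / xvcmpgtsp) and then selects with xxsel, i.e. "compare, then
// pick", which is what keeps the behaviour consistent with std::min/std::max
// and SSE when NaNs are involved. min_nan_ref/max_nan_ref are illustrative names.
float min_nan_ref(float a, float b) {
  // mask = (a >= b); result = mask ? b : a.
  // If either operand is NaN the compare is false and 'a' is returned,
  // matching std::min's NaN behaviour noted in the comment above.
  return (a >= b) ? b : a;
}

float max_nan_ref(float a, float b) {
  // mask = (b > a); result = mask ? b : a.
  return (b > a) ? b : a;
}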
ret; + __asm__("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" + : "=&wa"(ret) + : "wa"(a), "wa"(b)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } @@ -168,18 +257,29 @@ C10_VSX_VEC_NAN_PROPAG(vec_max_nan2, vfloat64, vbool64, vec_max) #undef C10_VSX_VEC_NAN_PROPAG +<<<<<<< HEAD #define DEFINE_MEMBER_UNARY_OP(op, op_type, func) \ +======= +#define DEFINE_MEMBER_UNARY_OP(op, op_type, func) \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized C10_ALWAYS_INLINE op() const { \ return Vectorized{func(_vec0), func(_vec1)}; \ } #define DEFINE_MEMBER_OP(op, op_type, func) \ +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE op(const Vectorized& other) const { \ return Vectorized{ \ +======= + Vectorized C10_ALWAYS_INLINE op(const Vectorized& other) \ + const { \ + return Vectorized{ \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) func(_vec0, other._vec0), func(_vec1, other._vec1)}; \ } #define DEFINE_MEMBER_BITWISE_OP(op, op_type, func) \ +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE op(const Vectorized& other) const { \ return Vectorized{ \ func(_vecb0, other._vecb0), func(_vecb1, other._vecb1)}; \ @@ -256,12 +356,101 @@ C10_VSX_VEC_NAN_PROPAG(vec_max_nan2, vfloat64, vbool64, vec_max) // it can be used to emulate blend faster constexpr int blendChoice(uint32_t mask, uint32_t half1 = 0xF, uint32_t half2 = 0xF0) { +======= + Vectorized C10_ALWAYS_INLINE op(const Vectorized& other) \ + const { \ + return Vectorized{ \ + func(_vecb0, other._vecb0), func(_vecb1, other._vecb1)}; \ + } + +#define DEFINE_MEMBER_TERNARY_OP(op, op_type, func) \ + Vectorized C10_ALWAYS_INLINE op( \ + const Vectorized& b, const Vectorized& c) const { \ + return Vectorized{ \ + func(_vec0, b._vec0, c._vec0), func(_vec1, b._vec1, c._vec1)}; \ + } + +#define DEFINE_MEMBER_EMULATE_BINARY_OP(op, op_type, binary_op) \ + Vectorized C10_ALWAYS_INLINE op(const Vectorized& b) \ + const { \ + Vectorized::vec_internal_type ret_0; \ + Vectorized::vec_internal_type ret_1; \ + for (int i = 0; i < Vectorized::size() / 2; i++) { \ + ret_0[i] = _vec0[i] binary_op b._vec0[i]; \ + ret_1[i] = _vec1[i] binary_op b._vec1[i]; \ + } \ + return Vectorized{ret_0, ret_1}; \ + } + +#define DEFINE_MEMBER_OP_AND_ONE(op, op_type, func) \ + Vectorized C10_ALWAYS_INLINE op(const Vectorized& other) \ + const { \ + using vvtype = Vectorized::vec_internal_type; \ + const vvtype v_one = vec_splats(static_cast(1.0)); \ + vvtype ret0 = (vvtype)func(_vec0, other._vec0); \ + vvtype ret1 = (vvtype)func(_vec1, other._vec1); \ + return Vectorized{vec_and(ret0, v_one), vec_and(ret1, v_one)}; \ + } + +#define DEFINE_CLAMP_FUNCS(operand_type) \ + template <> \ + Vectorized C10_ALWAYS_INLINE clamp( \ + const Vectorized& a, \ + const Vectorized& min, \ + const Vectorized& max) { \ + return Vectorized{ \ + vec_min_nan(vec_max_nan(a.vec0(), min.vec0()), max.vec0()), \ + vec_min_nan(vec_max_nan(a.vec1(), min.vec1()), max.vec1())}; \ + } \ + template <> \ + Vectorized C10_ALWAYS_INLINE clamp_min( \ + const Vectorized& a, \ + const Vectorized& min) { \ + return Vectorized{ \ + vec_max_nan(a.vec0(), min.vec0()), vec_max_nan(a.vec1(), min.vec1())}; \ + } \ + template <> \ + Vectorized C10_ALWAYS_INLINE clamp_max( \ + const Vectorized& a, \ + const Vectorized& max) { \ + return Vectorized{ \ + vec_min_nan(a.vec0(), max.vec0()), 
vec_min_nan(a.vec1(), max.vec1())}; \ + } + +#define DEFINE_REINTERPRET_CAST_FUNCS( \ + first_type, cast_type, cast_inner_vector_type) \ + template <> \ + C10_ALWAYS_INLINE Vectorized cast( \ + const Vectorized& src) { \ + return Vectorized{ \ + (cast_inner_vector_type)src.vec0(), \ + (cast_inner_vector_type)src.vec1()}; \ + } + +#define DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(first_type) \ + DEFINE_REINTERPRET_CAST_FUNCS(first_type, double, vfloat64) \ + DEFINE_REINTERPRET_CAST_FUNCS(first_type, float, vfloat32) \ + DEFINE_REINTERPRET_CAST_FUNCS(first_type, int64_t, vint64) \ + DEFINE_REINTERPRET_CAST_FUNCS(first_type, int32_t, vint32) \ + DEFINE_REINTERPRET_CAST_FUNCS(first_type, int16_t, vint16) + +// it can be used to emulate blend faster +constexpr int blendChoice( + uint32_t mask, + uint32_t half1 = 0xF, + uint32_t half2 = 0xF0) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uint32_t none = 0; uint32_t both = half1 | half2; // clamp it between 0 and both mask = mask & both; // return (a._vec0, a._vec1) +<<<<<<< HEAD if (mask == none) return 0; +======= + if (mask == none) + return 0; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // return (b._vec0,b._vec1) else if (mask == both) return 1; @@ -320,18 +509,36 @@ constexpr vbool64 VsxDblMask2(uint32_t mask) { constexpr int maskForComplex(uint32_t mask) { mask = mask & 0xF; int complex_mask = 0; +<<<<<<< HEAD if (mask & 1) complex_mask |= 3; if (mask & 2) complex_mask |= (3 << 2); if (mask & 4) complex_mask |= (3 << 4); if (mask & 8) complex_mask |= (3 << 6); +======= + if (mask & 1) + complex_mask |= 3; + if (mask & 2) + complex_mask |= (3 << 2); + if (mask & 4) + complex_mask |= (3 << 4); + if (mask & 8) + complex_mask |= (3 << 6); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return complex_mask; } constexpr int maskForComplexDbl(uint32_t mask) { mask = mask & 0x3; int complex_mask = 0; +<<<<<<< HEAD if (mask & 1) complex_mask |= 3; if (mask & 2) complex_mask |= (3 << 2); +======= + if (mask & 1) + complex_mask |= 3; + if (mask & 2) + complex_mask |= (3 << 2); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return complex_mask; } @@ -352,7 +559,13 @@ constexpr vbool32 VsxComplexMask2(uint32_t mask) { return VsxMask1(maskForComplex(mask2)); } +<<<<<<< HEAD constexpr vbool64 VsxComplexDblMask1(uint32_t mask) { return VsxDblMask1(mask); } +======= +constexpr vbool64 VsxComplexDblMask1(uint32_t mask) { + return VsxDblMask1(mask); +} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) constexpr vbool64 VsxComplexDblMask2(uint32_t mask) { uint32_t mask2 = (mask & 0xF) >> 2; @@ -369,8 +582,28 @@ constexpr int offset0 = 0; constexpr int offset16 = 16; // #Constants +<<<<<<< HEAD const vuint8 mask_zero_bits = vuint8{128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 96, 64, 32, 0}; +======= +const vuint8 mask_zero_bits = vuint8{ + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 96, + 64, + 32, + 0}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const vuint8 swap_mask = vuint8{4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}; @@ 
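// Hedged per-lane reading of DEFINE_CLAMP_FUNCS above: clamp raises the value
// to the lower bound with the NaN-aware max, then caps it with the NaN-aware
// min; clamp_min and clamp_max are the two halves. The scalar forms below
// mirror the compare-and-select order of vec_max_nan / vec_min_nan.
float clamp_ref(float a, float lo, float hi) {
  const float raised = (lo > a) ? lo : a;  // vec_max_nan(a, lo)
  return (raised >= hi) ? hi : raised;     // vec_min_nan(raised, hi)
}
float clamp_min_ref(float a, float lo) { return (lo > a) ? lo : a; }
float clamp_max_ref(float a, float hi) { return (a >= hi) ? hi : a; }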
-391,9 +624,15 @@ const vbool32 imag_mask = vbool32{0x0, 0xFFFFFFFF, 0x0, 0xFFFFFFFF}; const vbool32 isign_mask = vbool32{0x0, 0x80000000, 0x0, 0x80000000}; const vbool32 rsign_mask = vbool32{0x80000000, 0x0, 0x80000000, 0x0}; +<<<<<<< HEAD const vbool64 vd_sign_mask = vbool64{0x8000000000000000, 0x8000000000000000}; const vbool64 vd_imag_mask = vbool64{0x0, 0xFFFFFFFFFFFFFFFF}; const vbool64 vd_real_mask = vbool64{0xFFFFFFFFFFFFFFFF, 0x0}; +======= +const vbool64 vd_sign_mask = vbool64{0x8000000000000000, 0x8000000000000000}; +const vbool64 vd_imag_mask = vbool64{0x0, 0xFFFFFFFFFFFFFFFF}; +const vbool64 vd_real_mask = vbool64{0xFFFFFFFFFFFFFFFF, 0x0}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const vbool64 vd_isign_mask = vbool64{0x0, 0x8000000000000000}; const vbool64 vd_rsign_mask = vbool64{0x8000000000000000, 0x0}; @@ -403,7 +642,12 @@ const vfloat32 one = vec_splats(1.f); const vfloat32 two = vec_splats(2.0f); const vfloat32 _4div_pi = vec_splats(1.27323954473516f); const vfloat32 v_inf = (vfloat32)vec_splats(0x7f800000u); +<<<<<<< HEAD const vfloat32 v_minus_inf = vfloat32{ 0xff800000u, 0xff800000u, 0xff800000u, 0xff800000u }; +======= +const vfloat32 v_minus_inf = + vfloat32{0xff800000u, 0xff800000u, 0xff800000u, 0xff800000u}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const vfloat32 v_nan = (vfloat32)vec_splats(0x7fffffff); const vfloat32 log10e_inv = vec_splats(0.43429448190325176f); const vfloat32 log2e_inv = vec_splats(1.4426950408889634f); @@ -432,7 +676,12 @@ const vfloat32 log_p8 = vec_splats(+3.3333331174E-1f); const vfloat32 log_q1 = vec_splats(-2.12194440e-4f); const vfloat32 log_q2 = vec_splats(0.693359375f); const vfloat32 max_logf = vec_splats(88.02969187150841f); +<<<<<<< HEAD const vfloat32 max_numf = vec_splats(1.7014117331926442990585209174225846272e38f); +======= +const vfloat32 max_numf = + vec_splats(1.7014117331926442990585209174225846272e38f); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const vfloat32 min_inf = (vfloat32)vec_splats(0xff800000u); const vfloat32 min_norm_pos = (vfloat32)vec_splats(0x0800000u); const vfloat32 minus_cephes_dp1 = vec_splats(-0.78515625f); @@ -456,8 +705,16 @@ const vfloat32 tanh_p4 = vec_splats(-3.33332819422E-1f); const vfloat32 vcheck = vec_splats((float)(1LL << 24)); const vfloat32 imag_one = vfloat32{0.f, 1.f, 0.f, 1.f}; const vfloat32 imag_half = vfloat32{0.f, 0.5f, 0.f, 0.5f}; +<<<<<<< HEAD const vfloat32 sqrt2_2 = vfloat32{0.70710676908493042f, 0.70710676908493042, 0.70710676908493042, 0.70710676908493042}; +======= +const vfloat32 sqrt2_2 = vfloat32{ + 0.70710676908493042f, + 0.70710676908493042, + 0.70710676908493042, + 0.70710676908493042}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const vfloat32 pi_2 = vfloat32{M_PI / 2, 0.0, M_PI / 2, 0.0}; const vfloat32 vf_89 = vfloat32{89.f, 89.f, 89.f, 89.f}; const vfloat64 vd_one = vec_splats(1.0); @@ -469,6 +726,10 @@ const vfloat64 vd_imag_half = vfloat64{0.0, 0.5}; const vfloat64 vd_sqrt2_2 = vfloat64{0.70710678118654757, 0.70710678118654757}; const vfloat64 vd_pi_2 = vfloat64{M_PI / 2.0, 0.0}; +<<<<<<< HEAD } // namespace +======= +} // namespace CPU_CAPABILITY +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed 
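// Hedged sketch of what maskForComplex above does: each complex element
// occupies two adjacent float lanes (real, imag), so bit i of the 4-bit
// element mask becomes bits 2i and 2i+1 of the lane mask. Written as a loop
// here for clarity; complex_lane_mask is an illustrative name.
#include <cstdint>

constexpr int complex_lane_mask(uint32_t mask) {
  mask &= 0xF;
  int out = 0;
  for (int i = 0; i < 4; ++i) {
    if (mask & (1u << i)) {
      out |= 3 << (2 * i);  // real lane + imag lane of element i
    }
  }
  return out;
}

static_assert(complex_lane_mask(0b0001) == 0b00000011, "element 0 -> lanes 0,1");
static_assert(complex_lane_mask(0b1010) == 0b11001100, "elements 1,3 -> lanes 2,3,6,7");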
dtypes with float/bfloat16/half (#2791)) } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h b/aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h index 7c2932b3aab7..87fa0477bd7a 100644 --- a/aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h +++ b/aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h @@ -31,8 +31,12 @@ constexpr bool is_zarch_implemented() { template constexpr bool is_zarch_implemented_quant() { return ( +<<<<<<< HEAD std::is_same_v || std::is_same_v || +======= + std::is_same_v || std::is_same_v || +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::is_same_v); } @@ -364,6 +368,13 @@ constexpr auto GetSwapMaskFloat() { } template +<<<<<<< HEAD +======= +struct is_vec_specialized_for()>> + : std::bool_constant {}; + +template +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) struct Vectorized()>> { public: using value_type = T; @@ -386,7 +397,12 @@ struct Vectorized()>> { Vectorized() {} C10_ALWAYS_INLINE Vectorized(vtype v) : _vec0{v}, _vec1{v} {} +<<<<<<< HEAD C10_ALWAYS_INLINE Vectorized(const vinner_data &v) : _vec0{v.first}, _vec1{v.second} {} +======= + C10_ALWAYS_INLINE Vectorized(const vinner_data& v) + : _vec0{v.first}, _vec1{v.second} {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) C10_ALWAYS_INLINE Vectorized(vtype v1, vtype v2) : _vec0{v1}, _vec1{v2} {} C10_ALWAYS_INLINE Vectorized(T s) : _vec0{vec_splats((ElementType)s)}, _vec1{vec_splats((ElementType)s)} {} @@ -396,7 +412,12 @@ struct Vectorized()>> { static Vectorized C10_ALWAYS_INLINE loadu(const U* ptr, int count = size()) { __at_align__ ElementType tmp_values[size()] = {}; +<<<<<<< HEAD std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(ElementType)); +======= + std::memcpy( + tmp_values, ptr, std::min(count, size()) * sizeof(ElementType)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return { vec_xl(offset0, &(tmp_values[0])), @@ -409,6 +430,7 @@ struct Vectorized()>> { static Vectorized C10_ALWAYS_INLINE loadu(const ElementType* ptr, int count = size()) { if (count == size()) { +<<<<<<< HEAD return { vec_xl(offset0, ptr), vec_xl(offset16, ptr)}; @@ -416,6 +438,14 @@ struct Vectorized()>> { __at_align__ ElementType tmp_values[size()] = {}; std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(ElementType)); +======= + return {vec_xl(offset0, ptr), vec_xl(offset16, ptr)}; + } + + __at_align__ ElementType tmp_values[size()] = {}; + std::memcpy( + tmp_values, ptr, std::min(count, size()) * sizeof(ElementType)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return { vec_xl(offset0, &(tmp_values[0])), @@ -430,8 +460,12 @@ struct Vectorized()>> { } template +<<<<<<< HEAD static Vectorized C10_ALWAYS_INLINE loadu_one_fourth(const U* ptr) { +======= + static Vectorized C10_ALWAYS_INLINE loadu_one_fourth(const U* ptr) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // load only first 8 bytes // only intended to be used with uint8_t return loadu(ptr, 8 / sizeof(ElementType)); @@ -439,7 +473,12 @@ struct Vectorized()>> { template struct StoreHelper { +<<<<<<< HEAD static void 
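// Hedged sketch of the partial-load pattern used by the zarch
// loadu(ptr, count) overloads above: copy count elements into a
// zero-initialized aligned scratch buffer, then do the full-width loads from
// the scratch, so nothing past ptr + count is read and tail lanes are
// well-defined zeros. loadu_partial_ref and the std::array return are
// scalar stand-ins for the two vec_xl loads.
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstring>

template <typename T, int N>
std::array<T, N> loadu_partial_ref(const T* ptr, int count) {
  alignas(32) T tmp[N] = {};  // like the __at_align__ tmp_values buffer
  std::memcpy(tmp, ptr,
              static_cast<std::size_t>(std::min(count, N)) * sizeof(T));
  std::array<T, N> out{};
  std::copy(tmp, tmp + N, out.begin());  // stands in for vec_xl(offset0/offset16)
  return out;
}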
C10_ALWAYS_INLINE store(const Vectorized &vec, U* ptr, int count = size()) { +======= + static void C10_ALWAYS_INLINE + store(const Vectorized& vec, U* ptr, int count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (count > 0) { __at_align__ ElementType tmp_values[size()]; vec_xst(vec._vec0, offset0, &(tmp_values[0])); @@ -452,7 +491,12 @@ struct Vectorized()>> { template struct StoreHelper { +<<<<<<< HEAD static void C10_ALWAYS_INLINE store(const Vectorized &vec, ElementType* ptr, int count = size()) { +======= + static void C10_ALWAYS_INLINE + store(const Vectorized& vec, ElementType* ptr, int count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (count == size()) { vec_xst(vec._vec0, offset0, ptr); vec_xst(vec._vec1, offset16, ptr); @@ -788,16 +832,24 @@ struct Vectorized()>> { return (*this <= other) & Vectorized((T)1.0); } +<<<<<<< HEAD template < typename U = T, std::enable_if_t, int> = 0> +======= + template , int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized C10_ALWAYS_INLINE abs() const { return {vec_abs(_vec0), vec_abs(_vec1)}; } +<<<<<<< HEAD template < typename U = T, std::enable_if_t, int> = 0> +======= + template , int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized C10_ALWAYS_INLINE abs() const { return {_vec0, _vec1}; } @@ -813,6 +865,7 @@ struct Vectorized()>> { } bool has_inf_nan() const { +<<<<<<< HEAD for (const auto i : c10::irange(size()/2)) { if(_isnan(_vec0[i]) || _isinf(_vec0[i])) { return true; @@ -820,6 +873,15 @@ struct Vectorized()>> { } for (const auto i : c10::irange(size()/2)) { if(_isnan(_vec1[i]) || _isinf(_vec1[i])) { +======= + for (const auto i : c10::irange(size() / 2)) { + if (_isnan(_vec0[i]) || _isinf(_vec0[i])) { + return true; + } + } + for (const auto i : c10::irange(size() / 2)) { + if (_isnan(_vec1[i]) || _isinf(_vec1[i])) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return true; } } @@ -900,9 +962,13 @@ struct Vectorized()>> { return sqrt().reciprocal(); } +<<<<<<< HEAD template < typename U = T, std::enable_if_t, int> = 0> +======= + template , int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized mapOrdinary(float (*const f)(float)) const { float a00 = f(_vec0[0]); float a01 = f(_vec0[1]); @@ -922,9 +988,13 @@ struct Vectorized()>> { return Vectorized(f(_vec0[0]), f(_vec0[1]), f(_vec1[0]), f(_vec1[1])); } +<<<<<<< HEAD template < typename U = T, std::enable_if_t, int> = 0> +======= + template , int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized mapOrdinary( float (*const f)(float, float), const Vectorized& b) const { @@ -1122,7 +1192,12 @@ struct Vectorized()>> { typename U = T, std::enable_if_t, int> = 0> Vectorized minimum(const Vectorized& other) const { +<<<<<<< HEAD Vectorized tmp = {vec_min(_vec0, other._vec0), vec_min(_vec1, other._vec1)}; +======= + Vectorized tmp = { + vec_min(_vec0, other._vec0), vec_min(_vec1, other._vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary 
elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) tmp = blendv(tmp, *this, isnan()); return blendv(tmp, other, other.isnan()); } @@ -1139,7 +1214,12 @@ struct Vectorized()>> { typename U = T, std::enable_if_t, int> = 0> Vectorized maximum(const Vectorized& other) const { +<<<<<<< HEAD Vectorized tmp = {vec_max(_vec0, other._vec0), vec_max(_vec1, other._vec1)}; +======= + Vectorized tmp = { + vec_max(_vec0, other._vec0), vec_max(_vec1, other._vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) tmp = blendv(tmp, *this, isnan()); return blendv(tmp, other, other.isnan()); } @@ -1176,9 +1256,13 @@ struct Vectorized()>> { return blendv(tmp, *this, isnan()); } +<<<<<<< HEAD template < typename U = T, std::enable_if_t, int> = 0> +======= + template , int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized swapped() const { auto swap_mask = GetSwapMaskFloat(); vtype v0 = vec_perm(_vec0, _vec0, swap_mask); @@ -1260,6 +1344,7 @@ struct Vectorized()>> { std::enable_if_t, int> = 0> Vectorized to_vec_float_helper() const { int32_t values[8] = { +<<<<<<< HEAD _vec0[0], _vec0[1], _vec0[2], @@ -1274,6 +1359,27 @@ struct Vectorized()>> { values[0], values[1], values[2], values[3], values[4], values[5], values[6], values[7] }; +======= + _vec0[0], + _vec0[1], + _vec0[2], + _vec0[3], + _vec0[4], + _vec0[5], + _vec0[6], + _vec0[7], + }; + + return Vectorized{ + values[0], + values[1], + values[2], + values[3], + values[4], + values[5], + values[6], + values[7]}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template < @@ -1282,6 +1388,7 @@ struct Vectorized()>> { Vectorized to_vec_uint8_helper() const { // helper function for float to uint8_t conversion uint8_t values[8] = { +<<<<<<< HEAD static_cast(_vec0[0]), static_cast(_vec0[1]), static_cast(_vec0[2]), @@ -1301,10 +1408,30 @@ struct Vectorized()>> { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +======= + static_cast(_vec0[0]), + static_cast(_vec0[1]), + static_cast(_vec0[2]), + static_cast(_vec0[3]), + static_cast(_vec1[0]), + static_cast(_vec1[1]), + static_cast(_vec1[2]), + static_cast(_vec1[3]), + }; + + return Vectorized{ + values[0], values[1], values[2], values[3], values[4], values[5], + values[6], values[7], 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; } }; +<<<<<<< HEAD #define ZVECTOR_OPERATORS(typex) \ template <> \ Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { \ @@ -1376,6 +1503,92 @@ struct Vectorized()>> { Vectorized C10_ALWAYS_INLINE operator<=(const Vectorized& a, const Vectorized& b) { \ return Vectorized{ \ vec_cmple(a.vec0(), b.vec0()), vec_cmple(a.vec1(), b.vec1())}; \ +======= +#define ZVECTOR_OPERATORS(typex) \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator+( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec0() + b.vec0(), a.vec1() + b.vec1()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator-( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec0() - b.vec0(), a.vec1() - b.vec1()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator*( \ + const Vectorized& a, const 
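// Hedged sketch of the NaN handling in the zarch floating-point minimum /
// maximum above: the plain vec_min/vec_max result is patched with two blendv
// calls, and the second one (keyed on other.isnan()) runs last, so a NaN in
// 'other' takes precedence. minimum_ref is an illustrative scalar model.
#include <algorithm>
#include <cmath>

float minimum_ref(float a, float b) {  // a plays *this, b plays other
  float tmp = std::min(a, b);
  if (std::isnan(a)) tmp = a;  // blendv(tmp, *this, isnan())
  if (std::isnan(b)) tmp = b;  // blendv(tmp, other, other.isnan()), applied last
  return tmp;
}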
Vectorized& b) { \ + return Vectorized{a.vec0() * b.vec0(), a.vec1() * b.vec1()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator/( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec0() / b.vec0(), a.vec1() / b.vec1()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator&( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{ \ + (Vectorized::vtype)(a.vecb0() & b.vecb0()), \ + (Vectorized::vtype)(a.vecb1() & b.vecb1())}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator|( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{ \ + (Vectorized::vtype)(a.vecb0() | b.vecb0()), \ + (Vectorized::vtype)(a.vecb1() | b.vecb1())}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator^( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{ \ + (Vectorized::vtype)(a.vecb0() ^ b.vecb0()), \ + (Vectorized::vtype)(a.vecb1() ^ b.vecb1())}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator==( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{ \ + vec_cmpeq(a.vec0(), b.vec0()), vec_cmpeq(a.vec1(), b.vec1())}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator!=( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{ \ + vec_cmpeq(a.vec0(), b.vec0()), vec_cmpeq(a.vec1(), b.vec1())} \ + ._not(); \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator>( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{ \ + vec_cmpgt(a.vec0(), b.vec0()), vec_cmpgt(a.vec1(), b.vec1())}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator>=( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{ \ + vec_cmpge(a.vec0(), b.vec0()), vec_cmpge(a.vec1(), b.vec1())}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator<( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{ \ + vec_cmplt(a.vec0(), b.vec0()), vec_cmplt(a.vec1(), b.vec1())}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator<=( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{ \ + vec_cmple(a.vec0(), b.vec0()), vec_cmple(a.vec1(), b.vec1())}; \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } ZVECTOR_OPERATORS(float) @@ -1389,6 +1602,7 @@ ZVECTOR_OPERATORS(int64_t) #undef ZVECTOR_OPERATORS +<<<<<<< HEAD #define ZVECTOR_OPERATORS(typex) \ template <> \ Vectorized C10_ALWAYS_INLINE operator<<(const Vectorized& a, const Vectorized& b) { \ @@ -1442,6 +1656,67 @@ ZVECTOR_OPERATORS(int64_t) template <> \ inline Vectorized operator~(const Vectorized& a) { \ return a._not(); \ +======= +#define ZVECTOR_OPERATORS(typex) \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator<<( \ + const Vectorized& a, const Vectorized& b) { \ + constexpr Vectorized::ElementType max_shift = \ + sizeof(Vectorized::ElementType) * CHAR_BIT; \ + \ + Vectorized::ElementType a_array[Vectorized::size()]; \ + Vectorized::ElementType b_array[Vectorized::size()]; \ + Vectorized::ElementType c_array[Vectorized::size()]; \ + \ + a.store(a_array); \ + b.store(b_array); \ + \ + for (int i = 0; i != Vectorized::size(); i++) { \ + typex shift = b_array[i]; \ + if ((static_cast>(shift) < 0) || \ + (shift >= max_shift)) { \ + c_array[i] = 0; \ + } else { \ + c_array[i] = static_cast>(a_array[i]) \ + << shift; \ + } \ + } \ + \ + return Vectorized::loadu(c_array); \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator>>( \ + const Vectorized& a, const Vectorized& b) { \ + 
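// Hedged sketch of the intended per-lane semantics of the ZVECTOR
// operator<< / operator>> macros in this hunk: shift amounts that are
// negative or at least the element width do not wrap. Left shifts produce 0;
// right shifts produce the value shifted "all the way", i.e. sign-fill for
// signed element types and zero for unsigned ones. shl_ref/shr_ref are
// illustrative names and avoid the undefined full-width scalar shift.
#include <climits>
#include <type_traits>

template <typename T>
T shl_ref(T a, T b) {
  constexpr unsigned bits = sizeof(T) * CHAR_BIT;
  const auto shift = static_cast<std::make_unsigned_t<T>>(b);
  if (static_cast<std::make_signed_t<T>>(b) < 0 || shift >= bits) {
    return T(0);  // out-of-range left shift -> 0
  }
  return static_cast<T>(static_cast<std::make_unsigned_t<T>>(a) << shift);
}

template <typename T>
T shr_ref(T a, T b) {
  constexpr unsigned bits = sizeof(T) * CHAR_BIT;
  const auto shift = static_cast<std::make_unsigned_t<T>>(b);
  if (static_cast<std::make_signed_t<T>>(b) < 0 || shift >= bits) {
    // out-of-range right shift: sign-fill for signed, zero for unsigned
    if constexpr (std::is_signed_v<T>) {
      return a < 0 ? T(-1) : T(0);
    } else {
      return T(0);
    }
  }
  return static_cast<T>(a >> shift);
}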
/* right shift value to retain sign bit for signed and no bits for \ + * unsigned */ \ + constexpr Vectorized::ElementType max_shift = \ + sizeof(typex) * CHAR_BIT - std::is_signed_v; \ + \ + Vectorized::ElementType a_array[Vectorized::size()]; \ + Vectorized::ElementType b_array[Vectorized::size()]; \ + Vectorized::ElementType c_array[Vectorized::size()]; \ + \ + a.store(a_array); \ + b.store(b_array); \ + \ + for (int i = 0; i != Vectorized::size(); i++) { \ + typex shift = b_array[i]; \ + if ((static_cast>(shift) < 0) || \ + (shift >= max_shift)) { \ + c_array[i] = a_array[i] >> max_shift; \ + } else { \ + c_array[i] = a_array[i] >> shift; \ + } \ + } \ + \ + return Vectorized::loadu(c_array); \ + } \ + \ + template <> \ + inline Vectorized operator~(const Vectorized& a) { \ + return a._not(); \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } ZVECTOR_OPERATORS(int8_t) @@ -1727,6 +2002,15 @@ C10_DIAGNOSTIC_POP() //////////////////////////////////QUANT/////////////////////////////////////////// template +<<<<<<< HEAD +======= +struct is_vec_specialized_for< + T, + std::enable_if_t()>> + : std::bool_constant {}; + +template +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) struct Vectorized()>> { public: using value_type = typename T::underlying; @@ -1906,7 +2190,11 @@ struct Vectorized()>> { (vecf_0 - zero_point) * scale, (vecf_1 - zero_point) * scale, (vecf_2 - zero_point) * scale, +<<<<<<< HEAD (vecf_3 - zero_point) * scale }; +======= + (vecf_3 - zero_point) * scale}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template < @@ -2034,6 +2322,7 @@ struct Vectorized()>> { } }; +<<<<<<< HEAD #define ZVECTOR_OPERATORS(typex) \ template <> \ Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { \ @@ -2092,6 +2381,79 @@ struct Vectorized()>> { \ Vectorized C10_ALWAYS_INLINE operator<=(const Vectorized& a, const Vectorized& b) { \ return Vectorized{a.vec() <= b.vec()}; \ +======= +#define ZVECTOR_OPERATORS(typex) \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator+( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() + b.vec()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator-( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() - b.vec()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator*( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() * b.vec()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator/( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() / b.vec()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator&( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() & b.vec()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator|( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() | b.vec()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator^( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() ^ b.vec()}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator==( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() == b.vec()}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator!=( \ + const Vectorized& a, const 
Vectorized& b) { \ + return Vectorized{a.vec() != b.vec()}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator>( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() > b.vec()}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator>=( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() >= b.vec()}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator<( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() < b.vec()}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator<=( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() <= b.vec()}; \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } ZVECTOR_OPERATORS(c10::qint32) @@ -2185,6 +2547,15 @@ constexpr U log10e_inv() { } template +<<<<<<< HEAD +======= +struct is_vec_specialized_for< + T, + std::enable_if_t()>> + : std::bool_constant {}; + +template +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) struct Vectorized()>> { public: using underline_type = decltype(std::declval().imag()); @@ -2205,7 +2576,12 @@ struct Vectorized()>> { public: Vectorized() {} +<<<<<<< HEAD C10_ALWAYS_INLINE Vectorized(const vinner_data &v) : _vec{v.first, v.second} {} +======= + C10_ALWAYS_INLINE Vectorized(const vinner_data& v) + : _vec{v.first, v.second} {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template = 0> C10_ALWAYS_INLINE Vectorized(T s1, T s2) @@ -2406,10 +2782,17 @@ struct Vectorized()>> { template < typename U = T, std::enable_if_t>::value, int> = 0> +<<<<<<< HEAD static typename Vectorized::vinner_type real_neg(const typename Vectorized::vinner_type &a) { const auto swap_mask = ZSimdVectBinary{ 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31}; +======= + static typename Vectorized::vinner_type real_neg( + const typename Vectorized::vinner_type& a) { + const auto swap_mask = ZSimdVectBinary{ + 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto a_neg = a.neg(); vtype v0 = vec_perm(a_neg.vec0(), a.vec0(), swap_mask); @@ -2420,12 +2803,21 @@ struct Vectorized()>> { template < typename U = T, std::enable_if_t>::value, int> = 0> +<<<<<<< HEAD static typename Vectorized::vinner_type real_neg(const typename Vectorized::vinner_type &a) { auto a_neg = a.neg(); vtype v0 = {a_neg.vec0()[0], a.vec0()[1]}; vtype v1 = {a_neg.vec1()[0], a.vec1()[1]}; return { v0, v1 }; +======= + static typename Vectorized::vinner_type real_neg( + const typename Vectorized::vinner_type& a) { + auto a_neg = a.neg(); + vtype v0 = {a_neg.vec0()[0], a.vec0()[1]}; + vtype v1 = {a_neg.vec1()[0], a.vec1()[1]}; + return {v0, v1}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized angle2_() const { @@ -2516,15 +2908,27 @@ struct Vectorized()>> { } Vectorized C10_ALWAYS_INLINE eq(const Vectorized& other) const { +<<<<<<< HEAD auto eq = _vec.eq(other._vec); // compares real and imag individually // If both real numbers and imag numbers are equal, then the complex numbers are equal +======= + auto eq = _vec.eq(other._vec); // compares real and imag individually + // If both real numbers and imag numbers are equal, then the 
complex numbers + // are equal +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto real = eq & vinner_type(real_mask()); auto imag = (eq & vinner_type(image_mask())).swapped(); return Vectorized{real & imag}; } Vectorized C10_ALWAYS_INLINE ne(const Vectorized& other) const { +<<<<<<< HEAD auto ne = _vec.ne(other._vec); // compares real and imag individually // If either real numbers or imag numbers are not equal, then the complex numbers are not equal +======= + auto ne = _vec.ne(other._vec); // compares real and imag individually + // If either real numbers or imag numbers are not equal, then the complex + // numbers are not equal +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto real = ne & vinner_type(real_mask()); auto imag = (ne & vinner_type(image_mask())).swapped(); return Vectorized{real | imag}; @@ -2551,8 +2955,12 @@ struct Vectorized()>> { return a.mergee().data(); } +<<<<<<< HEAD static T abs_helper(const T &value) { +======= + static T abs_helper(const T& value) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return T(std::abs(value)); } @@ -2633,6 +3041,7 @@ struct Vectorized()>> { } }; +<<<<<<< HEAD #define ZVECTOR_OPERATORS(typex) \ template <> \ Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { \ @@ -2721,6 +3130,114 @@ struct Vectorized()>> { \ Vectorized C10_ALWAYS_INLINE operator>=(const Vectorized& a, const Vectorized& b) { \ TORCH_CHECK(false, "not supported for complex numbers"); \ +======= +#define ZVECTOR_OPERATORS(typex) \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator+( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() + b.vec()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator-( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() - b.vec()}; \ + } \ + \ + template <> \ + Vectorized inline operator*( \ + const Vectorized& a, const Vectorized& b) { \ + /* (a + bi) * (c + di) = (ac - bd) + (ad + bc)i */ \ + Vectorized::vinner_type bv = b.vec(); \ + \ + /* this is more z arch friendly than simulating horizontal from x86 */ \ + Vectorized::vinner_type vi = bv.mergeo(); \ + Vectorized::vinner_type vr = bv.mergee(); \ + vi = vi ^ \ + Vectorized::vinner_type( \ + rsign_mask::underline_type>()); \ + Vectorized::vinner_type ret = a.vec() * vr; \ + Vectorized::vinner_type vx_swapped = a.vec().swapped(); \ + ret = fmadd(vx_swapped, vi, ret); \ + \ + return Vectorized{ret}; \ + } \ + \ + template <> \ + Vectorized inline operator/( \ + const Vectorized& a, const Vectorized& b) { \ + /* Unfortunately, this breaks some tests */ \ + /* Implement it like it's done for avx2 */ \ + auto fabs_cd = b.vec().abs(); /* |c| |d| */ \ + auto fabs_dc = fabs_cd.swapped(); /* |d| |c| */ \ + auto scale = Vectorized::vinner_type{1.0} / \ + maximum(fabs_cd, fabs_dc); /* 1/sc 1/sc */ \ + auto a2 = a.vec() * scale; /* a/sc b/sc */ \ + auto b2 = b.vec() * scale; /* c/sc d/sc */ \ + auto acbd2 = a2 * b2; /* ac/sc^2 bd/sc^2 */ \ + \ + auto dc2 = b2.swapped(); /* d/sc c/sc */ \ + dc2 = Vectorized::real_neg(dc2); /* -d/|c,d| c/sc */ \ + auto adbc2 = a2 * dc2; /* -ad/sc^2 bc/sc^2 */ \ + auto sum1 = acbd2 + acbd2.swapped(); /* (ac+bd)/sc^2 (ac+bd)/sc^2 */ \ + auto sum2 = adbc2 + adbc2.swapped(); /* (bc-ad)/sc^2 (bc-ad)/sc^2 */ \ + auto res2 = 
Vectorized::vinner_type::mergee( \ + sum1, sum2); /* (ac+bd)/sc^2 (bc-ad)/sc^2 */ \ + \ + /* get the denominator */ \ + Vectorized::vinner_type denom2 = \ + Vectorized{b2}.abs_2_(); /* (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 */ \ + res2 = res2 / denom2; \ + return Vectorized{res2}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator&( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() & b.vec()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator|( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() | b.vec()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator^( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() ^ b.vec()}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator==( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() == b.vec()}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator!=( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() != b.vec()}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator<( \ + const Vectorized& a, const Vectorized& b) { \ + TORCH_CHECK(false, "not supported for complex numbers"); \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator<=( \ + const Vectorized& a, const Vectorized& b) { \ + TORCH_CHECK(false, "not supported for complex numbers"); \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator>( \ + const Vectorized& a, const Vectorized& b) { \ + TORCH_CHECK(false, "not supported for complex numbers"); \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator>=( \ + const Vectorized& a, const Vectorized& b) { \ + TORCH_CHECK(false, "not supported for complex numbers"); \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } ZVECTOR_OPERATORS(c10::complex) @@ -2872,28 +3389,55 @@ std::pair, Vectorized> inline deinterleave2< } template +<<<<<<< HEAD std::enable_if_t, at::vec::Vectorized> inline convert_int8_to_float(const Vectorized &src) { // Note: this function only convert inputs number of elements equal to at::vec::Vectorized.size() // Only handle first 64 bits +======= +std::enable_if_t< + std::is_same_v, + at::vec::Vectorized< + float>> inline convert_int8_to_float(const Vectorized& src) { + // Note: this function only convert inputs number of elements equal to + // at::vec::Vectorized.size() Only handle first 64 bits +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto vec_int = src.to_vec_float_helper(); return zvec_convert_to_float(vec_int); } template +<<<<<<< HEAD std::enable_if_t, at::vec::Vectorized> inline convert_float_to_int8(const Vectorized &src) { constexpr auto min_val = std::numeric_limits::min(); constexpr auto max_val = std::numeric_limits::max(); auto vec_int = clamp(zvec_convert_to_int(src), Vectorized(min_val), Vectorized(max_val)); +======= +std::enable_if_t< + std::is_same_v, + at::vec::Vectorized< + T>> inline convert_float_to_int8(const Vectorized& src) { + constexpr auto min_val = std::numeric_limits::min(); + constexpr auto max_val = std::numeric_limits::max(); + + auto vec_int = clamp( + zvec_convert_to_int(src), + Vectorized(min_val), + Vectorized(max_val)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return vec_int.to_vec_uint8_helper(); } #undef DEFINE_CLAMP_MAXMIN_FUNCS #undef DEFINE_MAXMIN_FUNCS +<<<<<<< HEAD } // 
namespace +======= +} // namespace CPU_CAPABILITY +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h index d593d184c319..091ad39d4eb3 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512.h @@ -5,9 +5,17 @@ #include +<<<<<<< HEAD #include #include #include +======= +// clang-format off +#include +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -15,6 +23,10 @@ #include #include #include +<<<<<<< HEAD +======= +// clang-format on +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -56,34 +68,58 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { return stream; } +<<<<<<< HEAD #if defined(CPU_CAPABILITY_AVX512) // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template<> +======= +#if defined(CPU_CAPABILITY_AVX512) + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512) +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized cast(const Vectorized& src) { return _mm512_castpd_ps(src); } +<<<<<<< HEAD template<> +======= +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized cast(const Vectorized& src) { return _mm512_castps_pd(src); } +<<<<<<< HEAD template<> +======= +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized cast(const Vectorized& src) { return _mm512_castsi512_ps(src); } +<<<<<<< HEAD template<> inline Vectorized cast(const Vectorized& src) { +======= +template <> +inline Vectorized cast( + const Vectorized& src) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_castsi512_pd(src); } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #ifndef _MSC_VER // MSVC is not working well on complex function overload. 
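Editor's aside (illustration, not part of the patch): the AVX-512 gather specializations that follow load each lane from base_addr at a byte offset of vindex[i] * scale, which is why only scale values of 1, 2, 4, or 8 are accepted. A minimal scalar sketch of that addressing, using a hypothetical helper name and assuming scale is a byte multiplier as with _mm512_i64gather_pd:

#include <cstdint>

// Scalar reference for the gather semantics: one element per lane,
// addressed as base_addr + vindex[i] * scale bytes.
template <int64_t scale>
void gather_ref(double* out, const double* base_addr,
                const int64_t* vindex, int lanes) {
  const char* base = reinterpret_cast<const char*>(base_addr);
  for (int i = 0; i < lanes; ++i) {
    out[i] = *reinterpret_cast<const double*>(base + vindex[i] * scale);
  }
}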
+<<<<<<< HEAD template std::enable_if_t> inline gather(const double* base_addr, const Vectorized& vindex) { @@ -93,25 +129,60 @@ inline gather(const double* base_addr, const Vectorized& vindex) { template std::enable_if_t> inline gather(const float* base_addr, const Vectorized& vindex) { +======= +template +std::enable_if_t< + scale == 1 || scale == 2 || scale == 4 || scale == 8, + Vectorized< + double>> inline gather(const double* base_addr, const Vectorized& vindex) { + return _mm512_i64gather_pd(vindex, base_addr, scale); +} + +template +std::enable_if_t< + scale == 1 || scale == 2 || scale == 4 || scale == 8, + Vectorized< + float>> inline gather(const float* base_addr, const Vectorized& vindex) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_i32gather_ps(vindex, base_addr, scale); } #endif // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #ifndef _MSC_VER // MSVC is not working well on complex function overload. +<<<<<<< HEAD template std::enable_if_t> inline mask_gather(const Vectorized& src, const double* base_addr, const Vectorized& vindex, Vectorized& mask) { +======= +template +std:: + enable_if_t> inline mask_gather( + const Vectorized& src, + const double* base_addr, + const Vectorized& vindex, + Vectorized& mask) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto all_ones = _mm512_castsi512_pd(_mm512_set1_epi64(0xFFFFFFFFFFFFFFFF)); auto mask_ = _mm512_cmp_pd_mask(all_ones, mask.values, _CMP_EQ_OQ); return _mm512_mask_i64gather_pd(src, mask_, vindex, base_addr, scale); } +<<<<<<< HEAD template std::enable_if_t> inline mask_gather(const Vectorized& src, const float* base_addr, const Vectorized& vindex, Vectorized& mask) { +======= +template +std:: + enable_if_t> inline mask_gather( + const Vectorized& src, + const float* base_addr, + const Vectorized& vindex, + Vectorized& mask) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto all_ones = _mm512_castsi512_ps(_mm512_set1_epi32(0xFFFFFFFF)); auto mask_ = _mm512_cmp_ps_mask(all_ones, mask.values, _CMP_EQ_OQ); return _mm512_mask_i32gather_ps(src, mask_, vindex, base_addr, scale); @@ -119,6 +190,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, #endif // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +<<<<<<< HEAD template<> Vectorized inline convert_to_int_of_same_size(const Vectorized &src) { @@ -140,14 +212,43 @@ inline convert_to_fp_of_same_size(const Vectorized &src) { template<> Vectorized inline convert_to_fp_of_same_size(const Vectorized &src) { +======= +template <> +Vectorized inline convert_to_int_of_same_size( + const Vectorized& src) { + return _mm512_cvtpd_epi64(src); +} + +template <> +Vectorized inline convert_to_int_of_same_size( + const Vectorized& src) { + return _mm512_cvttps_epi32(src); +} + +template <> +Vectorized inline convert_to_fp_of_same_size( + const Vectorized& src) { + return _mm512_cvtepi64_pd(src); +} + +template <> +Vectorized inline convert_to_fp_of_same_size( + const Vectorized& src) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_cvtepi32_ps(src); } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template <> +<<<<<<< HEAD std::pair, 
Vectorized> inline interleave2(const Vectorized& a, const Vectorized& b) { +======= +std::pair, Vectorized> inline interleave2( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // inputs: // a = {a0, a1, a3, a3, a4, a5, a6, a7} // b = {b0, b1, b2, b3, b4, b5, b6, b7} @@ -156,6 +257,7 @@ inline interleave2(const Vectorized& a, const Vectorized // {a4, b4, a5, b5, a6, b6, a7, b7} __m512i idx1 = _mm512_set_epi64(11, 3, 10, 2, 9, 1, 8, 0); __m512i idx2 = _mm512_set_epi64(15, 7, 14, 6, 13, 5, 12, 4); +<<<<<<< HEAD return std::make_pair(_mm512_mask_permutex2var_pd(a, 0xff, idx1, b), _mm512_mask_permutex2var_pd(a, 0xff, idx2, b)); } @@ -176,19 +278,53 @@ inline interleave2(const Vectorized& a, const Vectorized& b 27, 11, 26, 10, 25, 9, 24, 8); return std::make_pair(_mm512_mask_permutex2var_ps(a, 0xffff, idx1, b), _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b)); +======= + return std::make_pair( + _mm512_mask_permutex2var_pd(a, 0xff, idx1, b), + _mm512_mask_permutex2var_pd(a, 0xff, idx2, b)); +} + +template <> +std::pair, Vectorized> inline interleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, + // a15} b = {b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, + // b14, b15} + // + // return: + // {a0, b0, a1, b1, a2, b2, a3, b3, a4, b4, a5, b5, a6, b6, a7, b7} + // {a8, b8, a9, b9, a10, b10, a11, b11, a12, b12, a13, b13, a14, b14, a15, + // b15} + __m512i idx1 = + _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0); + __m512i idx2 = _mm512_set_epi32( + 31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8); + return std::make_pair( + _mm512_mask_permutex2var_ps(a, 0xffff, idx1, b), + _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEINTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template <> +<<<<<<< HEAD std::pair, Vectorized> inline deinterleave2(const Vectorized& a, const Vectorized& b) { +======= +std::pair, Vectorized> inline deinterleave2( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // inputs: // a = {a0, b0, a1, b1, a2, b2, a3, b3} // b = {a4, b4, a5, b5, a6, b6, a7, b7} // output: // return {a0, a1, a2, a3, a4, a5, a6, a7} // {b0, b1, b2, b3, b4, b5, b6, b7} +<<<<<<< HEAD // The members of indices have been written in binary format for better understandability __m512i idx1 = _mm512_set_epi64(14, 12, 10, 8, 6, 4, 2, 0); __m512i idx2 = _mm512_set_epi64(15, 13, 11, 9, 7, 5, 3, 1); @@ -213,10 +349,44 @@ inline deinterleave2(const Vectorized& a, const Vectorized& return std::make_pair(_mm512_mask_permutex2var_ps(a, 0xffff, idx1, b), _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b)); +======= + // The members of indices have been written in binary format for better + // understandability + __m512i idx1 = _mm512_set_epi64(14, 12, 10, 8, 6, 4, 2, 0); + __m512i idx2 = _mm512_set_epi64(15, 13, 11, 9, 7, 5, 3, 1); + + return std::make_pair( + _mm512_mask_permutex2var_pd(a, 0xff, idx1, b), + _mm512_mask_permutex2var_pd(a, 0xff, idx2, b)); +} + +template <> +std::pair, Vectorized> inline deinterleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + 
// a = {a0, b0, a1, b1, a2, b2, a3, b3, a4, b4, a5, b5, a6, b6, a7, b7} + // b = {a8, b8, a9, b9, a10, b10, a11, b11, a12, b12, a13, b13, a14, b14, + // a15, b15} + // output: + // return {a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, + // a15} + // {b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, + // b15} + __m512i idx1 = _mm512_set_epi32( + 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i idx2 = _mm512_set_epi32( + 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + + return std::make_pair( + _mm512_mask_permutex2var_ps(a, 0xffff, idx1, b), + _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FLIP ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +<<<<<<< HEAD template<> inline Vectorized flip(const Vectorized & v) { const __m512i mask = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, @@ -226,16 +396,33 @@ inline Vectorized flip(const Vectorized & v) { template<> inline Vectorized flip(const Vectorized & v) { +======= +template <> +inline Vectorized flip(const Vectorized& v) { + const __m512i mask = + _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + return _mm512_permutexvar_ps(mask, v); +} + +template <> +inline Vectorized flip(const Vectorized& v) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const __m512i mask = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); return _mm512_permutexvar_pd(mask, v); } +<<<<<<< HEAD template<> inline Vectorized flip(const Vectorized & v) { +======= +template <> +inline Vectorized flip(const Vectorized& v) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const __m512i mask = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); return _mm512_permutexvar_epi64(mask, v); } +<<<<<<< HEAD template<> inline Vectorized flip(const Vectorized & v) { const __m512i mask = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, @@ -259,11 +446,125 @@ inline __m512i flip8(const __m512i & v) { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ); +======= +template <> +inline Vectorized flip(const Vectorized& v) { + const __m512i mask = + _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + return _mm512_permutexvar_epi32(mask, v); +} + +template <> +inline Vectorized flip(const Vectorized& v) { + const __m512i mask = _mm512_set_epi16( + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31); + return _mm512_permutexvar_epi16(mask, v); +} + +inline __m512i flip8(const __m512i& v) { + const __m512i mask1 = _mm512_set_epi8( + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const __m512i mask2 = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6); auto reversed_vec = _mm512_shuffle_epi8(v, mask1); return 
_mm512_permutexvar_epi64(mask2, reversed_vec); } +<<<<<<< HEAD template<> inline Vectorized flip(const Vectorized & v) { return flip8(v); @@ -271,6 +572,15 @@ inline Vectorized flip(const Vectorized & v) { template<> inline Vectorized flip(const Vectorized & v) { +======= +template <> +inline Vectorized flip(const Vectorized& v) { + return flip8(v); +} + +template <> +inline Vectorized flip(const Vectorized& v) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return flip8(v); } @@ -288,4 +598,10 @@ inline Vectorized operator&&( #endif // defined(CPU_CAPABILITY_AVX512) +<<<<<<< HEAD }}} +======= +} // namespace CPU_CAPABILITY +} // namespace vec +} // namespace at +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h index f116929f8b08..7ddca47f000a 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h @@ -12,7 +12,10 @@ #include #endif +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { @@ -85,7 +88,12 @@ static inline __m512i cvtfp32_bf16(const __m512& a, const __m512& b) { t_lo = _mm512_mask_blend_epi32(mask_lo, nan, t_lo); t_hi = _mm512_mask_blend_epi32(mask_hi, nan, t_hi); +<<<<<<< HEAD t_lo = _mm512_packus_epi32(t_lo, t_hi); // t_hi[4-7] t_lo[4-7] t_hi[0-4] t_lo[0-4] +======= + t_lo = _mm512_packus_epi32( + t_lo, t_hi); // t_hi[4-7] t_lo[4-7] t_hi[0-4] t_lo[0-4] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512i idx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0); return _mm512_permutexvar_epi64(idx, t_lo); } @@ -113,6 +121,7 @@ static inline void cvtfp16_fp32(const __m512i& a, __m512& o1, __m512& o2) { } static inline __m256i cvtfp32_fp16(const __m512& src) { +<<<<<<< HEAD return _mm512_cvtps_ph( src, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); } @@ -122,12 +131,23 @@ static inline __m512i cvtfp32_fp16(const __m512& a, const __m512& b) { a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); __m256i hi = _mm512_cvtps_ph( b, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +======= + return _mm512_cvtps_ph(src, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +} + +static inline __m512i cvtfp32_fp16(const __m512& a, const __m512& b) { + __m256i lo = + _mm512_cvtps_ph(a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + __m256i hi = + _mm512_cvtps_ph(b, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 t_lo = _mm512_castsi512_ps(_mm512_castsi256_si512(lo)); __m256 t_hi = _mm256_castsi256_ps(hi); return _mm512_castps_si512(_mm512_insertf32x8(t_lo, t_hi, 1)); } // dtype conversion between float16/bfloat16 and float32 +<<<<<<< HEAD template , int> = 0> inline void cvt_to_fp32(const __m256i& a, __m512& o); template <> inline void cvt_to_fp32(const __m256i& a, __m512& o) { @@ -159,17 +179,77 @@ template <> inline __m512i cvt_from_fp32(const __m512& a, const __m return cvtfp32_fp16(a, b); } template <> inline __m512i cvt_from_fp32(const __m512& a, const __m512& b) { 
+======= +template < + typename T, + typename std::enable_if_t, int> = 0> +inline void cvt_to_fp32(const __m256i& a, __m512& o); +template <> +inline void cvt_to_fp32(const __m256i& a, __m512& o) { + cvtbf16_fp32(a, o); +} +template <> +inline void cvt_to_fp32(const __m256i& a, __m512& o) { + cvtfp16_fp32(a, o); +} + +template < + typename T, + typename std::enable_if_t, int> = 0> +inline void cvt_to_fp32(const __m512i& a, __m512& o1, __m512& o2); +template <> +inline void cvt_to_fp32(const __m512i& a, __m512& o1, __m512& o2) { + cvtbf16_fp32(a, o1, o2); +} +template <> +inline void cvt_to_fp32(const __m512i& a, __m512& o1, __m512& o2) { + cvtfp16_fp32(a, o1, o2); +} + +template < + typename T, + bool is_compare_op = false, + typename std::enable_if_t, int> = 0> +inline __m512i cvt_from_fp32(const __m512& a, const __m512& b); +template <> +inline __m512i cvt_from_fp32( + const __m512& a, + const __m512& b) { + return cvtfp32_bf16(a, b); +} +template <> +inline __m512i cvt_from_fp32(const __m512& a, const __m512& b) { + return merge_compare_result(a, b); +} +template <> +inline __m512i cvt_from_fp32(const __m512& a, const __m512& b) { + return cvtfp32_fp16(a, b); +} +template <> +inline __m512i cvt_from_fp32(const __m512& a, const __m512& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return cvtfp32_fp16(a, b); } template class Vectorized16 { +<<<<<<< HEAD static_assert( is_reduced_floating_point_v, "Support only float16 and bfloat16."); private: __m512i values; public: +======= + static_assert( + is_reduced_floating_point_v, + "Support only float16 and bfloat16."); + + private: + __m512i values; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = uint16_t; using size_type = int; static constexpr size_type size() { @@ -181,6 +261,7 @@ static_assert( value_type uw = val.x; values = _mm512_set1_epi16(uw); } +<<<<<<< HEAD Vectorized16(T val1, T val2, T val3, T val4, T val5, T val6, T val7, T val8, T val9, T val10, T val11, T val12, @@ -194,14 +275,89 @@ static_assert( val24.x, val23.x, val22.x, val21.x, val20.x, val19.x, val18.x, val17.x, val16.x, val15.x, val14.x, val13.x, val12.x, val11.x, val10.x, val9.x, val8.x, val7.x, val6.x, val5.x, val4.x, val3.x, val2.x, val1.x); +======= + Vectorized16( + T val1, + T val2, + T val3, + T val4, + T val5, + T val6, + T val7, + T val8, + T val9, + T val10, + T val11, + T val12, + T val13, + T val14, + T val15, + T val16, + T val17, + T val18, + T val19, + T val20, + T val21, + T val22, + T val23, + T val24, + T val25, + T val26, + T val27, + T val28, + T val29, + T val30, + T val31, + T val32) { + values = _mm512_set_epi16( + val32.x, + val31.x, + val30.x, + val29.x, + val28.x, + val27.x, + val26.x, + val25.x, + val24.x, + val23.x, + val22.x, + val21.x, + val20.x, + val19.x, + val18.x, + val17.x, + val16.x, + val15.x, + val14.x, + val13.x, + val12.x, + val11.x, + val10.x, + val9.x, + val8.x, + val7.x, + val6.x, + val5.x, + val4.x, + val3.x, + val2.x, + val1.x); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } operator __m512i() const { return values; } T& operator[](int idx) = delete; +<<<<<<< HEAD const T& operator[](int idx) const = delete; int zero_mask() const { // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit +======= + 
const T& operator[](int idx) const = delete; + int zero_mask() const { + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated to 0-bit +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_cmpeq_epi16_mask(values, _mm512_set1_epi16(0)); } static Vectorized loadu(const void* ptr, int16_t count = size()) { @@ -223,12 +379,20 @@ static_assert( static Vectorized blend(const Vectorized& a, const Vectorized& b) { return _mm512_mask_blend_epi16(mask, a.values, b.values); } +<<<<<<< HEAD static Vectorized blendv(const Vectorized& a, const Vectorized& b, const Vectorized& mask) { +======= + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto all_ones = _mm512_set1_epi16(0xFFFF); auto mask_ = _mm512_cmp_epi16_mask(mask, all_ones, _MM_CMPINT_EQ); return _mm512_mask_blend_epi16(mask_, a.values, b.values); } +<<<<<<< HEAD template static Vectorized arange(T base = 0.f, step_t step = static_cast(1)) { return Vectorized( @@ -243,6 +407,50 @@ static_assert( } static Vectorized set(const Vectorized& a, const Vectorized& b, int64_t count = size()) { +======= + template + static Vectorized arange( + T base = 0.f, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step, + base + 8 * step, + base + 9 * step, + base + 10 * step, + base + 11 * step, + base + 12 * step, + base + 13 * step, + base + 14 * step, + base + 15 * step, + base + 16 * step, + base + 17 * step, + base + 18 * step, + base + 19 * step, + base + 20 * step, + base + 21 * step, + base + 22 * step, + base + 23 * step, + base + 24 * step, + base + 25 * step, + base + 26 * step, + base + 27 * step, + base + 28 * step, + base + 29 * step, + base + 30 * step, + base + 31 * step); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -311,8 +519,13 @@ static_assert( } return b; } +<<<<<<< HEAD #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wignored-qualifiers" +======= +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wignored-qualifiers" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized map(SLEEF_CONST __m512 (*SLEEF_CONST_OLD vop)(__m512)) const { __m512 lo, hi; @@ -328,12 +541,23 @@ static_assert( __m512 zero = _mm512_set1_ps(0.0); __m512i zeroi = _mm512_castps_si512(zero); lo_mask = _mm512_cmp_ps_mask(lo, zero, _CMP_UNORD_Q); +<<<<<<< HEAD lo = _mm512_castsi512_ps(_mm512_mask_set1_epi32(zeroi, lo_mask, 0xFFFF'FFFF)); hi_mask = _mm512_cmp_ps_mask(hi, zero, _CMP_UNORD_Q); hi = _mm512_castsi512_ps(_mm512_mask_set1_epi32(zeroi, hi_mask, 0xFFFF'FFFF)); return merge_compare_result(lo, hi); } #pragma clang diagnostic pop +======= + lo = _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zeroi, lo_mask, 0xFFFF'FFFF)); + hi_mask = _mm512_cmp_ps_mask(hi, zero, _CMP_UNORD_Q); + hi = _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zeroi, hi_mask, 0xFFFF'FFFF)); + return 
merge_compare_result(lo, hi); + } +#pragma clang diagnostic pop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized abs() const { return _mm512_andnot_si512(_mm512_set1_epi16(0x8000), values); } @@ -344,10 +568,17 @@ static_assert( const auto zero_vec = _mm512_set1_ps(0.f); const auto nan_vec = _mm512_set1_ps(NAN); const auto not_nan_mask = _mm512_cmp_ps_mask(values, values, _CMP_EQ_OQ); +<<<<<<< HEAD const auto non_nan_mask_vec = _mm512_mask_set1_epi32(_mm512_castps_si512(zero_vec), not_nan_mask, 0xFFFFFFFF); const auto nan_mask = _mm512_cmp_ps_mask(_mm512_castsi512_ps(non_nan_mask_vec), zero_vec, _CMP_EQ_OQ); +======= + const auto non_nan_mask_vec = _mm512_mask_set1_epi32( + _mm512_castps_si512(zero_vec), not_nan_mask, 0xFFFFFFFF); + const auto nan_mask = _mm512_cmp_ps_mask( + _mm512_castsi512_ps(non_nan_mask_vec), zero_vec, _CMP_EQ_OQ); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const auto pi = _mm512_set1_ps(c10::pi); const auto neg_mask = _mm512_cmp_ps_mask(values, zero_vec, _CMP_LT_OQ); @@ -386,7 +617,11 @@ static_assert( Vectorized atanh() const { return map(Sleef_atanhf16_u10); } +<<<<<<< HEAD Vectorized atan2(const Vectorized &b) const { +======= + Vectorized atan2(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 lo, hi; __m512 b1, b2; cvt_to_fp32(values, lo, hi); @@ -395,12 +630,20 @@ static_assert( auto o2 = Sleef_atan2f16_u10(hi, b2); return cvt_from_fp32(o1, o2); } +<<<<<<< HEAD Vectorized copysign(const Vectorized &sign) const { // copy sign bit (0x8000) from sign and remaining bits from values __m512i mask_value = _mm512_set1_epi32(~0x80008000); __m512i mask_signbit = _mm512_set1_epi32(0x80008000); return Vectorized( _mm512_or_si512( +======= + Vectorized copysign(const Vectorized& sign) const { + // copy sign bit (0x8000) from sign and remaining bits from values + __m512i mask_value = _mm512_set1_epi32(~0x80008000); + __m512i mask_signbit = _mm512_set1_epi32(0x80008000); + return Vectorized(_mm512_or_si512( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm512_and_si512(values, mask_value), _mm512_and_si512(sign, mask_signbit))); } @@ -436,7 +679,11 @@ static_assert( Vectorized exp_u20() const { return exp(); } +<<<<<<< HEAD Vectorized fmod(const Vectorized & q) const { +======= + Vectorized fmod(const Vectorized& q) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 x_lo, x_hi; cvt_to_fp32(values, x_lo, x_hi); __m512 q_lo, q_hi; @@ -445,7 +692,11 @@ static_assert( auto o2 = Sleef_fmodf16(x_hi, q_hi); return cvt_from_fp32(o1, o2); } +<<<<<<< HEAD Vectorized hypot(const Vectorized &b) const { +======= + Vectorized hypot(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 lo, hi; __m512 b1, b2; cvt_to_fp32(values, lo, hi); @@ -500,7 +751,11 @@ static_assert( const auto o2 = _mm512_loadu_ps(tmp2); return cvt_from_fp32(o1, o2); } +<<<<<<< HEAD Vectorized igamma(const Vectorized &x) const { +======= + Vectorized igamma(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel 
for mixed dtypes with float/bfloat16/half (#2791)) __m512 lo, hi; __m512 xlo, xhi; cvt_to_fp32(values, lo, hi); @@ -520,7 +775,11 @@ static_assert( return cvt_from_fp32(o1, o2); } +<<<<<<< HEAD Vectorized igammac(const Vectorized &x) const { +======= + Vectorized igammac(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 lo, hi; __m512 xlo, xhi; cvt_to_fp32(values, lo, hi); @@ -583,8 +842,15 @@ static_assert( Vectorized round() const { __m512 lo, hi; cvt_to_fp32(values, lo, hi); +<<<<<<< HEAD auto o1 = _mm512_roundscale_ps(lo, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); auto o2 = _mm512_roundscale_ps(hi, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +======= + auto o1 = _mm512_roundscale_ps( + lo, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + auto o2 = _mm512_roundscale_ps( + hi, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return cvt_from_fp32(o1, o2); } Vectorized tan() const { @@ -596,8 +862,15 @@ static_assert( Vectorized trunc() const { __m512 lo, hi; cvt_to_fp32(values, lo, hi); +<<<<<<< HEAD auto o1 = _mm512_roundscale_ps(lo, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); auto o2 = _mm512_roundscale_ps(hi, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); +======= + auto o1 = + _mm512_roundscale_ps(lo, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); + auto o2 = + _mm512_roundscale_ps(hi, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return cvt_from_fp32(o1, o2); } Vectorized lgamma() const { @@ -626,7 +899,11 @@ static_assert( auto o2 = _mm512_div_ps(ones, _mm512_sqrt_ps(hi)); return cvt_from_fp32(o1, o2); } +<<<<<<< HEAD Vectorized pow(const Vectorized &b) const { +======= + Vectorized pow(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 lo, hi; __m512 b1, b2; cvt_to_fp32(values, lo, hi); @@ -635,8 +912,14 @@ static_assert( auto o2 = Sleef_powf16_u10(hi, b2); return cvt_from_fp32(o1, o2); } +<<<<<<< HEAD private: template +======= + + private: + template +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized inline binary_compare(const VectorizedType& b, Op op) const { __m512 a_lo, a_hi; __m512 b_lo, b_hi; @@ -644,56 +927,101 @@ static_assert( cvt_to_fp32(b.values, b_lo, b_hi); auto o1 = op(a_lo, b_lo); auto o2 = op(a_hi, b_hi); +<<<<<<< HEAD return cvt_from_fp32(o1, o2); } public: +======= + return cvt_from_fp32(o1, o2); + } + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized inline operator>(const Vectorized& other) const { return binary_compare(other, [](__m512 x, __m512 y) { auto zero_vec = _mm512_set1_epi32(0); auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_GT_OQ); +<<<<<<< HEAD return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); +======= + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }); } Vectorized inline operator<(const Vectorized& other) 
const { return binary_compare(other, [](__m512 x, __m512 y) { auto zero_vec = _mm512_set1_epi32(0); auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_LT_OQ); +<<<<<<< HEAD return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); +======= + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }); } Vectorized inline operator>=(const Vectorized& other) const { return binary_compare(other, [](__m512 x, __m512 y) { auto zero_vec = _mm512_set1_epi32(0); auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ); +<<<<<<< HEAD return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); +======= + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }); } Vectorized inline operator<=(const Vectorized& other) const { return binary_compare(other, [](__m512 x, __m512 y) { auto zero_vec = _mm512_set1_epi32(0); auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_LE_OQ); +<<<<<<< HEAD return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); +======= + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }); } Vectorized inline operator==(const Vectorized16& other) const { return binary_compare(other, [](__m512 x, __m512 y) { auto zero_vec = _mm512_set1_epi32(0); auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_EQ_OQ); +<<<<<<< HEAD return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); +======= + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }); } Vectorized inline operator!=(const Vectorized16& other) const { return binary_compare(other, [](__m512 x, __m512 y) { auto zero_vec = _mm512_set1_epi32(0); auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_NEQ_UQ); +<<<<<<< HEAD return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); +======= + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }); } }; +<<<<<<< HEAD template static inline Vectorized binary_op_as_fp32(const Vectorized& a, const Vectorized& b, Op op) { +======= +template +static inline Vectorized binary_op_as_fp32( + const Vectorized& a, + const Vectorized& b, + Op op) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a_lo, a_hi; __m512 b_lo, b_hi; cvt_to_fp32(__m512i(a), a_lo, a_hi); @@ -704,8 +1032,16 @@ static inline Vectorized binary_op_as_fp32(const Vectorized& a, const Vect } template <> +<<<<<<< HEAD class Vectorized: public Vectorized16 { public: +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorized16 { + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using Vectorized16::Vectorized16; using value_type = BFloat16; @@ -720,6 +1056,7 @@ class Vectorized: public 
Vectorized16 { Vectorized le(const Vectorized& other) const; }; +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { return _mm512_add_ps(x, y); }); } @@ -763,6 +1100,79 @@ inline Vectorized Vectorized::lt(const Vectorized& } inline Vectorized Vectorized::le(const Vectorized& other) const { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_add_ps(x, y); + }); +} +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_sub_ps(x, y); + }); +} +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_mul_ps(x, y); + }); +} +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_div_ps(x, y); + }); +} +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return _mm512_and_si512(a, b); +} +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return _mm512_or_si512(a, b); +} +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return _mm512_xor_si512(a, b); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*this <= other) & Vectorized(1.0f); } @@ -774,7 +1184,13 @@ inline Vectorized Vectorized::frac() const { // Implements the IEEE 754 201X `maximum` operation, which propagates NaN if // either input is a NaN. template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a_lo, a_hi; __m512 b_lo, b_hi; cvtbf16_fp32(__m512i(a), a_lo, a_hi); @@ -794,7 +1210,13 @@ Vectorized inline maximum(const Vectorized& a, const Vectori // Implements the IEEE 754 201X `minimum` operation, which propagates NaN if // either input is a NaN. 
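Editor's aside (illustration, not part of the patch): the surrounding maximum/minimum specializations both follow the "propagates NaN if either input is a NaN" note above; a scalar sketch of that semantic, distinct from std::max/std::min, which would instead ignore a NaN in the first operand:

#include <cmath>
#include <limits>

// NaN in either operand yields NaN, matching the IEEE 754-201x style
// comments around these vectorized kernels.
static inline float maximum_ref(float a, float b) {
  if (std::isnan(a) || std::isnan(b))
    return std::numeric_limits<float>::quiet_NaN();
  return a > b ? a : b;
}
static inline float minimum_ref(float a, float b) {
  if (std::isnan(a) || std::isnan(b))
    return std::numeric_limits<float>::quiet_NaN();
  return a < b ? a : b;
}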
template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a_lo, a_hi; __m512 b_lo, b_hi; __m512i zero_vec = _mm512_set1_epi32(0); @@ -804,10 +1226,17 @@ Vectorized inline minimum(const Vectorized& a, const Vectori auto min_hi = _mm512_min_ps(a_hi, b_hi); auto nan_lo_mask = _mm512_cmp_ps_mask(a_lo, b_lo, _CMP_UNORD_Q); auto nan_hi_mask = _mm512_cmp_ps_mask(a_hi, b_hi, _CMP_UNORD_Q); +<<<<<<< HEAD auto nan_lo = _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, nan_lo_mask, 0xFFFFFFFF)); auto nan_hi = _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, nan_hi_mask, 0xFFFFFFFF)); +======= + auto nan_lo = _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, nan_lo_mask, 0xFFFFFFFF)); + auto nan_hi = _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, nan_hi_mask, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Exploit the fact that all-ones is a NaN. auto o1 = _mm512_or_ps(min_lo, nan_lo); auto o2 = _mm512_or_ps(min_hi, nan_hi); @@ -815,8 +1244,15 @@ Vectorized inline minimum(const Vectorized& a, const Vectori } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a_lo, a_hi; __m512 min_lo, min_hi; __m512 max_lo, max_hi; @@ -829,7 +1265,13 @@ Vectorized inline clamp(const Vectorized& a, } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a_lo, a_hi; __m512 max_lo, max_hi; cvtbf16_fp32(__m512i(a), a_lo, a_hi); @@ -840,7 +1282,13 @@ Vectorized inline clamp_max(const Vectorized& a, const Vecto } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a_lo, a_hi; __m512 min_lo, min_hi; cvtbf16_fp32(__m512i(a), a_lo, a_hi); @@ -856,8 +1304,15 @@ inline void convert(const BFloat16* src, BFloat16* dst, int64_t n) { #ifndef __msvc_cl__ #pragma unroll #endif +<<<<<<< HEAD for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { auto vsrc = _mm512_loadu_si512(reinterpret_cast<__m512i*>((void*)(src + i))); +======= + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + auto vsrc = + _mm512_loadu_si512(reinterpret_cast<__m512i*>((void*)(src + i))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm512_storeu_si512(reinterpret_cast<__m512i*>((void*)(dst + i)), vsrc); } #ifndef __msvc_cl__ @@ -871,7 +1326,12 @@ inline void convert(const BFloat16* src, BFloat16* dst, int64_t n) { template <> inline void 
convert(const float* src, BFloat16* dst, int64_t n) { int64_t i; +<<<<<<< HEAD for (i = 0; i + Vectorized::size() <= n; i += Vectorized::size()) { +======= + for (i = 0; i + Vectorized::size() <= n; + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a = _mm512_loadu_ps(&src[i]); __m512 b = _mm512_loadu_ps(&src[i + 16]); @@ -885,7 +1345,11 @@ inline void convert(const float* src, BFloat16* dst, int64_t n) { template <> inline void convert(const double* src, BFloat16* dst, int64_t n) { +<<<<<<< HEAD auto load_float = [](const double *src) -> __m512 { +======= + auto load_float = [](const double* src) -> __m512 { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Load one float vector from an array of doubles __m256 a = _mm512_cvtpd_ps(_mm512_loadu_pd(src)); __m256 b = _mm512_cvtpd_ps(_mm512_loadu_pd(src + 8)); @@ -893,7 +1357,12 @@ inline void convert(const double* src, BFloat16* dst, int64_t n) { }; int64_t i; +<<<<<<< HEAD for (i = 0; i + Vectorized::size() <= n; i += Vectorized::size()) { +======= + for (i = 0; i + Vectorized::size() <= n; + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a = load_float(&src[i]); __m512 b = load_float(&src[i + 16]); @@ -906,8 +1375,15 @@ inline void convert(const double* src, BFloat16* dst, int64_t n) { } template <> +<<<<<<< HEAD Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a_lo, a_hi; __m512 b_lo, b_hi; __m512 c_lo, c_hi; @@ -921,6 +1397,7 @@ Vectorized inline fmadd(const Vectorized& a, static inline void _transpose_mxn_half_16_16(__m256i t[], __m512i u[]) { __m512i r[8]; +<<<<<<< HEAD // a0a1 a2a3 a4a5 a6a7 a8a9 a10a11 a12a13 a14a15 e0e1 e2e3 e4e5 e6e7 e8e9 e10e11 e12e13 e14e15 // b0-b15 f0-f15 // c0-c15 g0-g15 @@ -929,11 +1406,17 @@ static inline void _transpose_mxn_half_16_16(__m256i t[], __m512i u[]) { // j0-j15 n0-n15 // k0-k15 o0-o15 // l0-l15 p0-p15 +======= + // a0a1 a2a3 a4a5 a6a7 a8a9 a10a11 a12a13 a14a15 e0e1 e2e3 e4e5 e6e7 e8e9 + // e10e11 e12e13 e14e15 b0-b15 f0-f15 c0-c15 g0-g15 d0-d15 h0-h15 i0-i15 + // m0-m15 j0-j15 n0-n15 k0-k15 o0-o15 l0-l15 p0-p15 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef __msvc_cl__ #pragma unroll(4) #endif for (int i = 0; i < 4; i++) { r[i] = _mm512_inserti64x4(_mm512_castsi256_si512(t[i]), t[i + 4], 0x01); +<<<<<<< HEAD r[i + 4] = _mm512_inserti64x4(_mm512_castsi256_si512(t[i + 8]), t[i + 12], 0x01); } @@ -943,6 +1426,18 @@ static inline void _transpose_mxn_half_16_16(__m256i t[], __m512i u[]) { // u3: c4c5 d4b5 c6c7 d6b7 c12c13 d12d13 c14c15 d14d15 g4g5 h4h5 g6g7 h6h7 g12g13 h12h13 g14g15 h14h15 // i j m n // k l o p +======= + r[i + 4] = + _mm512_inserti64x4(_mm512_castsi256_si512(t[i + 8]), t[i + 12], 0x01); + } + + // u0: a0a1 b0b1 a2a3 b2b3 a8a9 b8b9 a10a11 b10b11 e0e1 f0f1 e2e3 f2f3 e8e9 + // f8f9 e10e11 f10f11 u1: a4a5 b4b5 a6a7 b6b7 a12a13 b12b13 a14a15 b14b15 e4e5 + // f4f5 e6e7 f6f7 e12e13 f12f13 e14e15 f14f15 u2: c0c1 d0d1 c2c3 d2d3 
c8c9 + // d8d9 c10c11 d10d11 g0g1 h0h1 g2g3 h2h3 g8g9 h8h9 g10g11 h10h11 u3: c4c5 + // d4b5 c6c7 d6b7 c12c13 d12d13 c14c15 d14d15 g4g5 h4h5 g6g7 h6h7 g12g13 + // h12h13 g14g15 h14h15 i j m n k l o p +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef __msvc_cl__ #pragma unroll(4) #endif @@ -951,11 +1446,19 @@ static inline void _transpose_mxn_half_16_16(__m256i t[], __m512i u[]) { u[i + 1] = _mm512_unpackhi_epi32(r[i], r[i + 1]); } +<<<<<<< HEAD // r0: a0a1 b0b1 c0c1 d0d1 a8a9 b8b9 c8c9 d8d9 e0e1 f0f1 g0g1 h0h1 e8e9 f8f9 g8g9 h8h9 // r1: a2a3 b2b3 c2c3 d2d3 a10a11 b10b11 c10c11 d10d11 e2e3 f2f3 g2g3 h2h3 e10e11 f10f11 g10g11 h10h11 // r2: a4a5 b4b5 c4c5 d4b5 a12a13 b12b13 c12c13 d12d13 // r3: a6a7 b6b7 c6c7 d6b7 a14a15 b14b15 c14c15 d14d15 // r4: i j k l m n o p +======= + // r0: a0a1 b0b1 c0c1 d0d1 a8a9 b8b9 c8c9 d8d9 e0e1 f0f1 g0g1 h0h1 e8e9 f8f9 + // g8g9 h8h9 r1: a2a3 b2b3 c2c3 d2d3 a10a11 b10b11 c10c11 d10d11 e2e3 f2f3 + // g2g3 h2h3 e10e11 f10f11 g10g11 h10h11 r2: a4a5 b4b5 c4c5 d4b5 a12a13 b12b13 + // c12c13 d12d13 r3: a6a7 b6b7 c6c7 d6b7 a14a15 b14b15 c14c15 d14d15 r4: i j k + // l m n o p +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) r[0] = _mm512_unpacklo_epi64(u[0], u[2]); r[1] = _mm512_unpackhi_epi64(u[0], u[2]); r[2] = _mm512_unpacklo_epi64(u[1], u[3]); @@ -1020,7 +1523,11 @@ static inline void _transpose_mxn_half_16_16(__m256i t[], __m512i u[]) { // TODO(Leslie): Add the AVX2 Version of transpose_mxn for BFloat16 and Float16 // Code referred to FBGEMM: // https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#L1483-L1607 +<<<<<<< HEAD template<> +======= +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline void transpose_mxn( const BFloat16* src, int64_t ld_src, @@ -1048,7 +1555,12 @@ inline void transpose_mxn( #pragma unroll(16) #endif for (int i = 0; i < 16; i++) { +<<<<<<< HEAD t[i] = _mm256_loadu_si256(reinterpret_cast(src + i * ld_src)); +======= + t[i] = + _mm256_loadu_si256(reinterpret_cast(src + i * ld_src)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } __m512i u[8]; @@ -1059,8 +1571,13 @@ inline void transpose_mxn( #endif for (int i = 0; i < 8; i++) { _mm256_storeu_si256( +<<<<<<< HEAD reinterpret_cast<__m256i*>(dst + (i * 2) * ld_dst), _mm512_extracti32x8_epi32(u[i], 0x0)); +======= + reinterpret_cast<__m256i*>(dst + (i * 2) * ld_dst), + _mm512_extracti32x8_epi32(u[i], 0x0)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm256_storeu_si256( reinterpret_cast<__m256i*>(dst + (i * 2 + 1) * ld_dst), _mm512_extracti32x8_epi32(u[i], 0x01)); @@ -1069,7 +1586,11 @@ inline void transpose_mxn( // Code referred to FBGEMM: // https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#L1483-L1607 +<<<<<<< HEAD template<> +======= +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline void transpose_mxn( const Half* src, int64_t ld_src, @@ -1082,7 +1603,12 @@ inline void transpose_mxn( #pragma unroll(16) #endif for (int i = 0; i < 16; i++) { +<<<<<<< HEAD t[i] = 
_mm256_loadu_si256(reinterpret_cast(src + i * ld_src)); +======= + t[i] = + _mm256_loadu_si256(reinterpret_cast(src + i * ld_src)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } __m512i u[8]; @@ -1093,8 +1619,13 @@ inline void transpose_mxn( #endif for (int i = 0; i < 8; i++) { _mm256_storeu_si256( +<<<<<<< HEAD reinterpret_cast<__m256i*>(dst + (i * 2) * ld_dst), _mm512_extracti32x8_epi32(u[i], 0x0)); +======= + reinterpret_cast<__m256i*>(dst + (i * 2) * ld_dst), + _mm512_extracti32x8_epi32(u[i], 0x0)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm256_storeu_si256( reinterpret_cast<__m256i*>(dst + (i * 2 + 1) * ld_dst), _mm512_extracti32x8_epi32(u[i], 0x01)); @@ -1106,6 +1637,7 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) { // t[1]: 4 36 5 37 6 38 7 39 12 44 13 45 14 46 15 47 20 ... 63 // t[2]: 64 96 65 97 66 98 67 99 72 104 73 105 74 106 75 ... 123 // t[3]: 68 100 69 101 70 102 71 103 76 108 77 109 78 110 79 111 84 ... 127 +<<<<<<< HEAD // t[4]: 128 160 129 161 130 162 131 163 136 168 137 169 138 170 139 171 144 ... 187 // t[5]: 132 164 133 165 134 166 135 167 140 172 141 173 142 174 143 175 148 ... 191 // t[6]: 192 224 193 225 194 226 195 227 200 232 201 233 202 234 203 235 208 ... 251 @@ -1121,6 +1653,26 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) { // t[16]: 512 544 513 545 514 546 515 547 520 552 521 553 522 554 523 555 528 ... 571 // ... // t[31]: 964 996 965 997 966 998 967 999 972 1004 973 1005 974 1006 975 1007 980 ... 1023 +======= + // t[4]: 128 160 129 161 130 162 131 163 136 168 137 169 138 170 139 171 144 + // ... 187 t[5]: 132 164 133 165 134 166 135 167 140 172 141 173 142 174 143 + // 175 148 ... 191 t[6]: 192 224 193 225 194 226 195 227 200 232 201 233 202 + // 234 203 235 208 ... 251 t[7]: 196 228 197 229 198 230 199 231 204 236 205 + // 237 206 238 207 239 212 ... 255 t[8]: 256 288 257 289 258 290 259 291 264 + // 296 265 297 266 298 267 299 272 ... 315 t[9]: 260 292 261 293 262 294 263 + // 295 268 300 269 301 270 302 271 303 276 ... 319 t[10]: 320 352 321 353 322 + // 354 323 355 328 360 329 361 330 362 331 363 336 ... 379 t[11]: 324 356 325 + // 357 326 358 327 359 332 364 333 365 334 366 335 367 340 ... 383 t[12]: 384 + // 416 385 417 386 418 387 419 392 424 393 425 394 426 395 427 400 ... 443 + // t[13]: 388 420 389 421 390 422 391 423 396 428 397 429 398 430 399 431 404 + // ... 447 t[14]: 448 480 449 481 450 482 451 483 456 488 457 489 458 490 459 + // 491 464 ... 507 t[15]: 452 484 453 485 454 486 455 487 460 492 461 493 462 + // 494 463 495 468 ... 511 t[16]: 512 544 513 545 514 546 515 547 520 552 521 + // 553 522 554 523 555 528 ... 571 + // ... + // t[31]: 964 996 965 997 966 998 967 999 972 1004 973 1005 974 1006 975 1007 + // 980 ... 1023 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef __msvc_cl__ #pragma unroll(16) #endif @@ -1133,6 +1685,7 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) { // t[1]: 2 34 66 98 3 35 67 99 10 42 74 106 11 43 75 107 18 ... 123 // t[2]: 4 36 68 100 5 37 69 101 12 44 76 108 13 45 77 109 20 ... 125 // t[3]: 6 38 70 102 7 39 71 103 14 46 78 110 15 47 79 111 22 ... 127 +<<<<<<< HEAD // t[4]: 128 160 192 224 129 161 193 225 136 168 200 232 137 169 201 233 144 ... 
249 // t[5]: 130 162 194 226 131 163 195 227 138 170 202 234 139 171 203 235 146 ... 251 // t[6]: 132 164 196 228 133 165 197 229 140 172 204 236 141 173 205 237 148 ... 253 @@ -1148,6 +1701,26 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) { // t[16]: 512 544 576 608 513 545 577 609 520 552 584 616 521 553 585 617 528 ... 633 // ... // t[31]: 902 934 966 998 903 935 967 999 910 942 974 1006 911 943 975 1007 918 ... 1023 +======= + // t[4]: 128 160 192 224 129 161 193 225 136 168 200 232 137 169 201 233 144 + // ... 249 t[5]: 130 162 194 226 131 163 195 227 138 170 202 234 139 171 203 + // 235 146 ... 251 t[6]: 132 164 196 228 133 165 197 229 140 172 204 236 141 + // 173 205 237 148 ... 253 t[7]: 134 166 198 230 135 167 199 231 142 174 206 + // 238 143 175 207 239 150 ... 255 t[8]: 256 288 320 352 257 289 321 353 264 + // 296 328 360 265 297 329 361 272 ... 377 t[9]: 258 290 322 354 259 291 323 + // 355 266 298 330 362 267 299 331 363 274 ... 379 t[10]: 260 292 324 356 261 + // 293 325 357 268 300 332 364 269 301 333 365 276 ... 381 t[11]: 262 294 326 + // 358 263 295 327 359 270 302 334 366 271 303 335 367 278 ... 383 t[12]: 384 + // 416 448 480 385 417 449 481 392 424 456 488 393 425 457 489 400 ... 505 + // t[13]: 386 418 450 482 387 419 451 483 394 426 458 490 395 427 459 491 402 + // ... 507 t[14]: 388 420 452 484 389 421 453 485 396 428 460 492 397 429 461 + // 493 404 ... 509 t[15]: 390 422 454 486 391 423 455 487 398 430 462 494 399 + // 431 463 495 406 ... 511 t[16]: 512 544 576 608 513 545 577 609 520 552 584 + // 616 521 553 585 617 528 ... 633 + // ... + // t[31]: 902 934 966 998 903 935 967 999 910 942 974 1006 911 943 975 1007 + // 918 ... 1023 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef __msvc_cl__ #pragma unroll(8) #endif @@ -1166,6 +1739,7 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) { // t[5]: 5 37 69 101 133 165 197 229 13 45 77 109 141 173 205 237 21 ... 253 // t[6]: 6 38 70 102 134 166 198 230 14 46 78 110 142 174 206 238 22 ... 254 // t[7]: 7 39 71 103 135 167 199 231 15 47 79 111 143 175 207 239 23 ... 255 +<<<<<<< HEAD // t[8]: 256 288 320 352 384 416 448 480 264 296 328 360 392 424 456 488 272 ... 504 // t[9]: 257 289 321 353 385 417 449 481 265 297 329 361 393 425 457 489 273 ... 505 // t[10]: 258 290 322 354 386 418 450 482 266 298 330 362 394 426 458 490 274 ... 506 @@ -1177,6 +1751,21 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) { // t[16]: 512 544 576 608 640 672 704 736 520 552 584 616 648 680 712 744 528 ... 760 // ... // t[31]: 775 807 839 871 903 935 967 999 783 815 847 879 911 943 975 1007 791 ... 1023 +======= + // t[8]: 256 288 320 352 384 416 448 480 264 296 328 360 392 424 456 488 272 + // ... 504 t[9]: 257 289 321 353 385 417 449 481 265 297 329 361 393 425 457 + // 489 273 ... 505 t[10]: 258 290 322 354 386 418 450 482 266 298 330 362 394 + // 426 458 490 274 ... 506 t[11]: 259 291 323 355 387 419 451 483 267 299 331 + // 363 395 427 459 491 275 ... 507 t[12]: 260 292 324 356 388 420 452 484 268 + // 300 332 364 396 428 460 492 276 ... 508 t[13]: 261 293 325 357 389 421 453 + // 485 269 301 333 365 397 429 461 493 277 ... 509 t[14]: 262 294 326 358 390 + // 422 454 486 270 302 334 366 398 430 462 494 278 ... 510 t[15]: 263 295 327 + // 359 391 423 455 487 271 303 335 367 399 431 463 495 279 ... 
511 t[16]: 512 + // 544 576 608 640 672 704 736 520 552 584 616 648 680 712 744 528 ... 760 + // ... + // t[31]: 775 807 839 871 903 935 967 999 783 815 847 879 911 943 975 1007 791 + // ... 1023 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef __msvc_cl__ #pragma unroll(4) #endif @@ -1195,6 +1784,7 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) { // t[1]: 1 33 65 97 129 161 193 225 257 289 321 353 385 417 449 481 17 ... 497 // t[2]: 2 34 66 98 130 162 194 226 258 290 322 354 386 418 450 482 18 ... 498 // t[3]: 3 35 67 99 131 163 195 227 259 291 323 355 387 419 451 483 19 ... 499 +<<<<<<< HEAD // t[4]: 4 36 68 100 132 164 196 228 260 292 324 356 388 420 452 484 20 ... 500 // t[5]: 5 37 69 101 133 165 197 229 261 293 325 357 389 421 453 485 21 ... 501 // t[6]: 6 38 70 102 134 166 198 230 262 294 326 358 390 422 454 486 22 ... 502 @@ -1210,6 +1800,25 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) { // t[16]: 512 544 576 608 640 672 704 736 768 800 832 864 896 928 960 992 528 ... 1008 // ... // t[31]: 527 559 591 623 655 687 719 751 783 815 847 879 911 943 975 1007 543 ... 1023 +======= + // t[4]: 4 36 68 100 132 164 196 228 260 292 324 356 388 420 452 484 20 ... + // 500 t[5]: 5 37 69 101 133 165 197 229 261 293 325 357 389 421 453 485 21 + // ... 501 t[6]: 6 38 70 102 134 166 198 230 262 294 326 358 390 422 454 486 + // 22 ... 502 t[7]: 7 39 71 103 135 167 199 231 263 295 327 359 391 423 455 + // 487 23 ... 503 t[8]: 8 40 72 104 136 168 200 232 264 296 328 360 392 424 + // 456 488 24 ... 504 t[9]: 9 41 73 105 137 169 201 233 265 297 329 361 393 + // 425 457 489 25 ... 505 t[10]: 10 42 74 106 138 170 202 234 266 298 330 362 + // 394 426 458 490 26 ... 506 t[11]: 11 43 75 107 139 171 203 235 267 299 331 + // 363 395 427 459 491 27 ... 507 t[12]: 12 44 76 108 140 172 204 236 268 300 + // 332 364 396 428 460 492 28 ... 508 t[13]: 13 45 77 109 141 173 205 237 269 + // 301 333 365 397 429 461 493 29 ... 509 t[14]: 14 46 78 110 142 174 206 238 + // 270 302 334 366 398 430 462 494 30 ... 510 t[15]: 15 47 79 111 143 175 207 + // 239 271 303 335 367 399 431 463 495 31 ... 511 t[16]: 512 544 576 608 640 + // 672 704 736 768 800 832 864 896 928 960 992 528 ... 1008 + // ... + // t[31]: 527 559 591 623 655 687 719 751 783 815 847 879 911 943 975 1007 543 + // ... 1023 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512i const1 = _mm512_set_epi64( 0x000000000000000d, 0x000000000000000c, @@ -1232,6 +1841,7 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) { #pragma unroll(8) #endif for (int i = 0; i < 8; ++i) { +<<<<<<< HEAD r[i] = _mm512_permutex2var_epi64(d[i], /*idx*/const1, d[i + 8]); r[i + 8] = _mm512_permutex2var_epi64(d[i], /*idx*/const2, d[i + 8]); r[i + 16] = _mm512_permutex2var_epi64(d[i + 16], /*idx*/const1, d[i + 24]); @@ -1257,6 +1867,37 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) { // t[16]: 16 48 80 112 144 176 208 240 272 304 336 368 400 432 464 496 528 560 ... 1008 // ... // t[31]: 31 63 95 127 159 191 223 255 287 319 351 383 415 447 479 511 543 575 ... 
1023 +======= + r[i] = _mm512_permutex2var_epi64(d[i], /*idx*/ const1, d[i + 8]); + r[i + 8] = _mm512_permutex2var_epi64(d[i], /*idx*/ const2, d[i + 8]); + r[i + 16] = _mm512_permutex2var_epi64(d[i + 16], /*idx*/ const1, d[i + 24]); + r[i + 24] = _mm512_permutex2var_epi64(d[i + 16], /*idx*/ const2, d[i + 24]); + } + + // t[0]: 0 32 64 96 128 160 192 224 256 288 320 352 384 416 448 480 512 544 + // ... 992 t[1]: 1 33 65 97 129 161 193 225 257 289 321 353 385 417 449 481 + // 513 545 ... 993 t[2]: 2 34 66 98 130 162 194 226 258 290 322 354 386 418 + // 450 482 514 546 ... 994 t[3]: 3 35 67 99 131 163 195 227 259 291 323 355 + // 387 419 451 483 515 547 ... 995 t[4]: 4 36 68 100 132 164 196 228 260 292 + // 324 356 388 420 452 484 516 548 ... 996 t[5]: 5 37 69 101 133 165 197 229 + // 261 293 325 357 389 421 453 485 517 549 ... 997 t[6]: 6 38 70 102 134 166 + // 198 230 262 294 326 358 390 422 454 486 518 550 ... 998 t[7]: 7 39 71 103 + // 135 167 199 231 263 295 327 359 391 423 455 487 519 551 ... 999 t[8]: 8 40 + // 72 104 136 168 200 232 264 296 328 360 392 424 456 488 520 552 ... 1000 + // t[9]: 9 41 73 105 137 169 201 233 265 297 329 361 393 425 457 489 521 553 + // ... 1001 t[10]: 10 42 74 106 138 170 202 234 266 298 330 362 394 426 458 + // 490 522 554 ... 1002 t[11]: 11 43 75 107 139 171 203 235 267 299 331 363 + // 395 427 459 491 523 555 ... 1003 t[12]: 12 44 76 108 140 172 204 236 268 + // 300 332 364 396 428 460 492 524 556 ... 1004 t[13]: 13 45 77 109 141 173 + // 205 237 269 301 333 365 397 429 461 493 525 557 ... 1005 t[14]: 14 46 78 + // 110 142 174 206 238 270 302 334 366 398 430 462 494 526 558 ... 1006 t[15]: + // 15 47 79 111 143 175 207 239 271 303 335 367 399 431 463 495 527 559 ... + // 1007 t[16]: 16 48 80 112 144 176 208 240 272 304 336 368 400 432 464 496 + // 528 560 ... 1008 + // ... + // t[31]: 31 63 95 127 159 191 223 255 287 319 351 383 415 447 479 511 543 575 + // ... 
1023 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512i const3 = _mm512_set_epi64( 0x000000000000000b, 0x000000000000000a, @@ -1279,17 +1920,36 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) { #pragma unroll(16) #endif for (int i = 0; i < 16; ++i) { +<<<<<<< HEAD d[i] = _mm512_permutex2var_epi64(r[i], /*idx*/const3, r[i + 16]); d[i + 16] = _mm512_permutex2var_epi64(r[i], /*idx*/const4, r[i + 16]); +======= + d[i] = _mm512_permutex2var_epi64(r[i], /*idx*/ const3, r[i + 16]); + d[i + 16] = _mm512_permutex2var_epi64(r[i], /*idx*/ const4, r[i + 16]); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } // Code referred to FBGEMM: // https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#LL19C6-L19C6 +<<<<<<< HEAD template<> inline void transpose_mxn(const BFloat16* src, int64_t ld_src, BFloat16* dst, int64_t ld_dst, int M, int N) { // load from src TORCH_CHECK(M <= 32 && N <= 32, "transpose_mxn expects M, N <= 32."); +======= +template <> +inline void transpose_mxn( + const BFloat16* src, + int64_t ld_src, + BFloat16* dst, + int64_t ld_dst, + int M, + int N) { + // load from src + TORCH_CHECK( + M <= 32 && N <= 32, "transpose_mxn expects M, N <= 32."); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512i r[32]; int i; if (N == 32) { @@ -1322,6 +1982,7 @@ inline void transpose_mxn(const BFloat16* src, int64_t ld_src, BFloat1 } } +<<<<<<< HEAD template && ((M <= 32 && M != 16) || (N <= 32 && N != 16)), int> = 0> inline void transpose_mxn(const BFloat16* src, int64_t ld_src, BFloat16* dst, int64_t ld_dst) { @@ -1330,6 +1991,32 @@ inline void transpose_mxn(const BFloat16* src, int64_t ld_src, BFloat16* dst, in template<> inline void transpose_mxn(const Half* src, int64_t ld_src, Half* dst, int64_t ld_dst, int M, int N) { +======= +template < + typename T, + int M, + int N, + typename std::enable_if_t< + std::is_same_v && + ((M <= 32 && M != 16) || (N <= 32 && N != 16)), + int> = 0> +inline void transpose_mxn( + const BFloat16* src, + int64_t ld_src, + BFloat16* dst, + int64_t ld_dst) { + transpose_mxn(src, ld_src, dst, ld_dst, M, N); +} + +template <> +inline void transpose_mxn( + const Half* src, + int64_t ld_src, + Half* dst, + int64_t ld_dst, + int M, + int N) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(M <= 32 && N <= 32, "transpose_mxn expects M, N <= 32."); // load from src __m512i r[32]; @@ -1364,15 +2051,39 @@ inline void transpose_mxn(const Half* src, int64_t ld_src, Half* dst, int6 } } +<<<<<<< HEAD template && ((M <= 32 && M != 16) || (N <= 32 && N != 16)), int> = 0> inline void transpose_mxn(const Half* src, int64_t ld_src, Half* dst, int64_t ld_dst) { +======= +template < + typename T, + int M, + int N, + typename std::enable_if_t< + std::is_same_v && + ((M <= 32 && M != 16) || (N <= 32 && N != 16)), + int> = 0> +inline void transpose_mxn( + const Half* src, + int64_t ld_src, + Half* dst, + int64_t ld_dst) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) transpose_mxn(src, ld_src, dst, ld_dst, M, N); } template <> +<<<<<<< HEAD class Vectorized: public Vectorized16 { public: +======= +struct 
is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorized16 { + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using Vectorized16::Vectorized16; using value_type = Half; @@ -1387,6 +2098,7 @@ class Vectorized: public Vectorized16 { Vectorized le(const Vectorized& other) const; }; +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { return _mm512_add_ps(x, y); }); } @@ -1431,6 +2143,80 @@ inline Vectorized Vectorized::lt(const Vectorized& other) cons } inline Vectorized Vectorized::le(const Vectorized& other) const { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_add_ps(x, y); + }); +} +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_sub_ps(x, y); + }); +} +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_mul_ps(x, y); + }); +} +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_div_ps(x, y); + }); +} + +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return _mm512_and_si512(a, b); +} +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return _mm512_or_si512(a, b); +} +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return _mm512_xor_si512(a, b); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*this <= other) & Vectorized(1.0f); } @@ -1442,7 +2228,13 @@ inline Vectorized Vectorized::frac() const { // Implements the IEEE 754 201X `maximum` operation, which propagates NaN if // either input is a NaN. 
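A minimal scalar sketch of the NaN-propagating semantics described in the comment above, which the vectorized maximum/minimum below reproduce by OR-ing an unordered-compare mask into the blended result; the helper name maximum_ref is illustrative and not part of the patch.

#include <cmath>
#include <limits>

// Scalar reference for the IEEE 754-201x `maximum` behaviour the vector code
// emulates: if either operand is NaN the result is NaN, otherwise the larger
// value. In the AVX-512 code the _CMP_UNORD_Q mask plays the isnan() role.
inline float maximum_ref(float a, float b) {
  if (std::isnan(a) || std::isnan(b))
    return std::numeric_limits<float>::quiet_NaN();
  return a > b ? a : b;
}
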
template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a_lo, a_hi; __m512 b_lo, b_hi; cvtfp16_fp32(__m512i(a), a_lo, a_hi); @@ -1462,7 +2254,13 @@ Vectorized inline maximum(const Vectorized& a, const Vectorized +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a_lo, a_hi; __m512 b_lo, b_hi; __m512i zero_vec = _mm512_set1_epi32(0); @@ -1472,10 +2270,17 @@ Vectorized inline minimum(const Vectorized& a, const Vectorized>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Exploit the fact that all-ones is a NaN. auto o1 = _mm512_or_ps(min_lo, nan_lo); auto o2 = _mm512_or_ps(min_hi, nan_hi); @@ -1483,8 +2288,15 @@ Vectorized inline minimum(const Vectorized& a, const Vectorized +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a_lo, a_hi; __m512 min_lo, min_hi; __m512 max_lo, max_hi; @@ -1497,7 +2309,13 @@ Vectorized inline clamp(const Vectorized& a, } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a_lo, a_hi; __m512 max_lo, max_hi; cvtfp16_fp32(__m512i(a), a_lo, a_hi); @@ -1508,7 +2326,13 @@ Vectorized inline clamp_max(const Vectorized& a, const Vectorized +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a_lo, a_hi; __m512 min_lo, min_hi; cvtfp16_fp32(__m512i(a), a_lo, a_hi); @@ -1524,8 +2348,15 @@ inline void convert(const Half* src, Half* dst, int64_t n) { #ifndef __msvc_cl__ #pragma unroll #endif +<<<<<<< HEAD for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { auto vsrc = _mm512_loadu_si512(reinterpret_cast<__m512i*>((void*)(src + i))); +======= + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + auto vsrc = + _mm512_loadu_si512(reinterpret_cast<__m512i*>((void*)(src + i))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm512_storeu_si512(reinterpret_cast<__m512i*>((void*)(dst + i)), vsrc); } #ifndef __msvc_cl__ @@ -1539,7 +2370,12 @@ inline void convert(const Half* src, Half* dst, int64_t n) { template <> inline void convert(const float* src, Half* dst, int64_t n) { int64_t i; +<<<<<<< HEAD for (i = 0; i + Vectorized::size() <= n; i += Vectorized::size()) { +======= + 
for (i = 0; i + Vectorized::size() <= n; + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a = _mm512_loadu_ps(&src[i]); __m512 b = _mm512_loadu_ps(&src[i + 16]); @@ -1553,7 +2389,11 @@ inline void convert(const float* src, Half* dst, int64_t n) { template <> inline void convert(const double* src, Half* dst, int64_t n) { +<<<<<<< HEAD auto load_float = [](const double *src) -> __m512 { +======= + auto load_float = [](const double* src) -> __m512 { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Load one float vector from an array of doubles __m256 a = _mm512_cvtpd_ps(_mm512_loadu_pd(src)); __m256 b = _mm512_cvtpd_ps(_mm512_loadu_pd(src + 8)); @@ -1561,7 +2401,12 @@ inline void convert(const double* src, Half* dst, int64_t n) { }; int64_t i; +<<<<<<< HEAD for (i = 0; i + Vectorized::size() <= n; i += Vectorized::size()) { +======= + for (i = 0; i + Vectorized::size() <= n; + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a = load_float(&src[i]); __m512 b = load_float(&src[i + 16]); @@ -1574,8 +2419,15 @@ inline void convert(const double* src, Half* dst, int64_t n) { } template <> +<<<<<<< HEAD Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a_lo, a_hi; __m512 b_lo, b_hi; __m512 c_lo, c_hi; @@ -1587,6 +2439,7 @@ Vectorized inline fmadd(const Vectorized& a, return cvtfp32_fp16(o1, o2); } +<<<<<<< HEAD #define CONVERT_VECTORIZED_INIT(type, name) \ inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ __m512 o1, o2; \ @@ -1627,12 +2480,59 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V } \ return Vectorized::loadu(arr2); \ } +======= +#define CONVERT_VECTORIZED_INIT(type, name) \ + inline std::tuple, Vectorized> \ + convert_##name##_float(const Vectorized& a) { \ + __m512 o1, o2; \ + cvt_to_fp32(__m512i(a), o1, o2); \ + return std::make_tuple(o1, o2); \ + } \ + \ + inline Vectorized convert_float_##name( \ + const Vectorized& a, const Vectorized& b) { \ + return cvt_from_fp32(__m512(a), __m512(b)); \ + } +CONVERT_VECTORIZED_INIT(BFloat16, bfloat16) +CONVERT_VECTORIZED_INIT(Half, half) + +#else // defined(CPU_CAPABILITY_AVX512) + +#define CONVERT_NON_VECTORIZED_INIT(type, name) \ + inline std::tuple, Vectorized> \ + convert_##name##_float(const Vectorized& a) { \ + constexpr int64_t K = Vectorized::size(); \ + __at_align__ float arr[K]; \ + __at_align__ type arr2[K]; \ + a.store(arr2); \ + for (const auto k : c10::irange(K)) { \ + arr[k] = c10::convert(arr2[k]); \ + } \ + return std::make_tuple( \ + Vectorized::loadu(arr), \ + Vectorized::loadu(arr + Vectorized::size())); \ + } \ + \ + inline Vectorized convert_float_##name( \ + const Vectorized& a, const Vectorized& b) { \ + constexpr int64_t K = Vectorized::size(); \ + __at_align__ float arr[K]; \ + __at_align__ type arr2[K]; \ + a.store(arr); \ + b.store(arr + Vectorized::size()); \ + for (const auto k : c10::irange(K)) { \ + arr2[k] = c10::convert(arr[k]); \ + } \ + return Vectorized::loadu(arr2); \ + } +>>>>>>> 
5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16) CONVERT_NON_VECTORIZED_INIT(Half, half) #endif // defined(CPU_CAPABILITY_AVX512) #if defined(CPU_CAPABILITY_AVX512) +<<<<<<< HEAD #define LOAD_FP32_VECTORIZED_INIT(type, name) \ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ auto values = _mm256_loadu_si256(reinterpret_cast(data)); \ @@ -1648,10 +2548,30 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec out1 = out1_values; \ out2 = out2_values; \ } +======= +#define LOAD_FP32_VECTORIZED_INIT(type, name) \ + inline void load_fp32_from_##name( \ + const type* data, Vectorized& out) { \ + auto values = _mm256_loadu_si256(reinterpret_cast(data)); \ + __m512 out_values; \ + cvt_to_fp32(values, out_values); \ + out = out_values; \ + } \ + \ + inline void load_fp32_from_##name( \ + const type* data, Vectorized& out1, Vectorized& out2) { \ + auto vec = Vectorized::loadu(data); \ + __m512 out1_values, out2_values; \ + cvt_to_fp32(vec, out1_values, out2_values); \ + out1 = out1_values; \ + out2 = out2_values; \ + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16) LOAD_FP32_VECTORIZED_INIT(Half, fp16) #else // defined(CPU_CAPABILITY_AVX512) +<<<<<<< HEAD #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ __at_align__ float values[Vectorized::size()]; \ @@ -1666,8 +2586,31 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec data += Vectorized::size(); \ load_fp32_from_##name(data, out2); \ } +======= +#define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ + inline void load_fp32_from_##name( \ + const type* data, Vectorized& out) { \ + __at_align__ float values[Vectorized::size()]; \ + for (const auto k : c10::irange(Vectorized::size())) { \ + values[k] = data[k]; \ + } \ + out = Vectorized::loadu(values); \ + } \ + \ + inline void load_fp32_from_##name( \ + const type* data, Vectorized& out1, Vectorized& out2) { \ + load_fp32_from_##name(data, out1); \ + data += Vectorized::size(); \ + load_fp32_from_##name(data, out2); \ + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) LOAD_FP32_NON_VECTORIZED_INIT(BFloat16, bf16) LOAD_FP32_NON_VECTORIZED_INIT(Half, fp16) #endif +<<<<<<< HEAD }} +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h index 444b41cfb7e5..68e93034cb5f 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h @@ -3,27 +3,51 @@ // DO NOT DEFINE STATIC DATA IN THIS HEADER! 
// See Note [Do not compile initializers with AVX] +<<<<<<< HEAD #include #include #include #include +======= +#include +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if defined(CPU_CAPABILITY_AVX512) #define SLEEF_STATIC_LIBS #include #endif +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { #if defined(CPU_CAPABILITY_AVX512) +<<<<<<< HEAD template <> class Vectorized> { private: __m512d values; static constexpr __m512i zero_vector {0, 0, 0, 0, 0, 0, 0, 0}; public: +======= +template <> +struct is_vec_specialized_for> : std::bool_constant { +}; + +template <> +class Vectorized> { + private: + __m512d values; + static constexpr __m512i zero_vector{0, 0, 0, 0, 0, 0, 0, 0}; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = c10::complex; using size_type = int; static constexpr size_type size() { @@ -34,6 +58,7 @@ template <> class Vectorized> { Vectorized(c10::complex val) { double real_value = val.real(); double imag_value = val.imag(); +<<<<<<< HEAD values = _mm512_setr_pd(real_value, imag_value, real_value, imag_value, real_value, imag_value, real_value, imag_value); } @@ -43,19 +68,53 @@ template <> class Vectorized> { val2.real(), val2.imag(), val3.real(), val3.imag(), val4.real(), val4.imag()); +======= + values = _mm512_setr_pd( + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value); + } + Vectorized( + c10::complex val1, + c10::complex val2, + c10::complex val3, + c10::complex val4) { + values = _mm512_setr_pd( + val1.real(), + val1.imag(), + val2.real(), + val2.imag(), + val3.real(), + val3.imag(), + val4.real(), + val4.imag()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } operator __m512d() const { return values; } template +<<<<<<< HEAD static Vectorized> blend(const Vectorized>& a, const Vectorized>& b) { // convert c10::complex index mask to V index mask: xy -> xxyy +======= + static Vectorized> blend( + const Vectorized>& a, + const Vectorized>& b) { + // convert c10::complex index mask to V index mask: xy -> xxyy +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NOLINTNEXTLINE(clang-diagnostic-warning) switch (mask) { case 0: return a; case 1: +<<<<<<< HEAD return _mm512_mask_blend_pd(0x03, a.values, b.values); //b0000 0001 = b0000 0011 case 2: return _mm512_mask_blend_pd(0x0C, a.values, b.values); //b0000 0010 = b0000 1100 @@ -108,6 +167,80 @@ template <> class Vectorized> { static Vectorized> set(const Vectorized>& a, const Vectorized>& b, int64_t count = size()) { +======= + return _mm512_mask_blend_pd( + 0x03, a.values, b.values); // b0000 0001 = b0000 0011 + case 2: + return _mm512_mask_blend_pd( + 0x0C, a.values, b.values); // b0000 0010 = b0000 1100 + case 3: + return _mm512_mask_blend_pd( + 0x0F, a.values, b.values); // b0000 0011 = b0000 1111 + case 4: + return _mm512_mask_blend_pd( + 0x30, a.values, b.values); // b0000 0100 = b0011 0000 + case 5: + return _mm512_mask_blend_pd( + 0x33, a.values, b.values); // b0000 0101 = b0011 0011 + case 6: + 
return _mm512_mask_blend_pd( + 0x3C, a.values, b.values); // b0000 0110 = b0011 1100 + case 7: + return _mm512_mask_blend_pd( + 0x3F, a.values, b.values); // b0000 0111 = b0011 1111 + case 8: + return _mm512_mask_blend_pd( + 0xC0, a.values, b.values); // b0000 1000 = b1100 0000 + case 9: + return _mm512_mask_blend_pd( + 0xC3, a.values, b.values); // b0000 1001 = b1100 0011 + case 10: + return _mm512_mask_blend_pd( + 0xCC, a.values, b.values); // b0000 1010 = b1100 1100 + case 11: + return _mm512_mask_blend_pd( + 0xCF, a.values, b.values); // b0000 1011 = b1100 1111 + case 12: + return _mm512_mask_blend_pd( + 0xF0, a.values, b.values); // b0000 1100 = b1111 0000 + case 13: + return _mm512_mask_blend_pd( + 0xF3, a.values, b.values); // b0000 1101 = b1111 0011 + case 14: + return _mm512_mask_blend_pd( + 0xFC, a.values, b.values); // b0000 1110 = b1111 1100 + case 15: + return _mm512_mask_blend_pd( + 0xFF, a.values, b.values); // b0000 1111 = b1111 1111 + } + return b; + } + static Vectorized> blendv( + const Vectorized>& a, + const Vectorized>& b, + const Vectorized>& mask) { + // convert c10::complex index mask to V index mask: xy -> xxyy + auto mask_ = _mm512_unpacklo_pd(mask.values, mask.values); + auto all_ones = _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF); + auto mmask = _mm512_cmp_epi64_mask( + _mm512_castpd_si512(mask_), all_ones, _MM_CMPINT_EQ); + return _mm512_mask_blend_pd(mmask, a.values, b.values); + } + template + static Vectorized> arange( + c10::complex base = 0., + step_t step = static_cast(1)) { + return Vectorized>( + base, + base + c10::complex(1) * step, + base + c10::complex(2) * step, + base + c10::complex(3) * step); + } + static Vectorized> set( + const Vectorized>& a, + const Vectorized>& b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -120,6 +253,7 @@ template <> class Vectorized> { } return b; } +<<<<<<< HEAD static Vectorized> loadu(const void* ptr, int64_t count = size()) { if (count == size()) return _mm512_loadu_pd(reinterpret_cast(ptr)); @@ -129,6 +263,20 @@ template <> class Vectorized> { // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. for (const auto i : c10::irange(2*size())) { +======= + static Vectorized> loadu( + const void* ptr, + int64_t count = size()) { + if (count == size()) + return _mm512_loadu_pd(reinterpret_cast(ptr)); + + __at_align__ double tmp_values[2 * size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
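A standalone sketch of the partial-load idiom the loadu() comment above describes, assuming only the standard library; kSize stands in for Vectorized<c10::complex<double>>::size() and load_partial is an illustrative name, not part of the patch.

#include <cstring>

// Zero a stack buffer with a loop (not "={0}"), copy only `count` valid
// complex values in, then the whole buffer can be read as one full-width
// vector with the unread tail lanes holding a defined value.
constexpr int kSize = 4;

void load_partial(const double* src, double (&tmp)[2 * kSize], int count) {
  for (int i = 0; i < 2 * kSize; ++i) {
    tmp[i] = 0.0;  // defined value for the lanes memcpy does not overwrite
  }
  std::memcpy(tmp, src, count * 2 * sizeof(double));  // count complex elements
}
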
+ for (const auto i : c10::irange(2 * size())) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) tmp_values[i] = 0.0; } std::memcpy( @@ -141,14 +289,25 @@ template <> class Vectorized> { if (count == size()) { _mm512_storeu_pd(reinterpret_cast(ptr), values); } else if (count > 0) { +<<<<<<< HEAD double tmp_values[2*size()]; +======= + double tmp_values[2 * size()]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm512_storeu_pd(reinterpret_cast(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(c10::complex)); } } +<<<<<<< HEAD const c10::complex& operator[](int idx) const = delete; c10::complex& operator[](int idx) = delete; Vectorized> map(c10::complex (*const f)(const c10::complex &)) const { +======= + const c10::complex& operator[](int idx) const = delete; + c10::complex& operator[](int idx) = delete; + Vectorized> map( + c10::complex (*const f)(const c10::complex&)) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ c10::complex tmp[size()]; store(tmp); for (const auto i : c10::irange(size())) { @@ -159,6 +318,7 @@ template <> class Vectorized> { // AVX512 doesn't have horizontal add & horizontal sub instructions. // TODO: hadd_pd() & hsub_pd() may have scope for improvement. static inline __m512d hadd_pd(__m512d a, __m512d b) { +<<<<<<< HEAD __m512i idx1 = _mm512_set_epi64(14, 6, 12, 4, 10, 2, 8, 0); __m512i idx2 = _mm512_set_epi64(15, 7, 13, 5, 11, 3, 9, 1); return _mm512_add_pd(_mm512_mask_permutex2var_pd(a, 0xff, idx1, b), @@ -199,6 +359,60 @@ template <> class Vectorized> { 0xFFFFFFFFFFFFFFFF, 0x0000000000000000)); auto angle = _mm512_permute_pd(angle_(), 0x55); // angle 90-angle return _mm512_and_pd(angle, real_mask); // angle 0 +======= + __m512i idx1 = _mm512_set_epi64(14, 6, 12, 4, 10, 2, 8, 0); + __m512i idx2 = _mm512_set_epi64(15, 7, 13, 5, 11, 3, 9, 1); + return _mm512_add_pd( + _mm512_mask_permutex2var_pd(a, 0xff, idx1, b), + _mm512_mask_permutex2var_pd(a, 0xff, idx2, b)); + } + static inline __m512d hsub_pd(__m512d a, __m512d b) { + __m512i idx1 = _mm512_set_epi64(14, 6, 12, 4, 10, 2, 8, 0); + __m512i idx2 = _mm512_set_epi64(15, 7, 13, 5, 11, 3, 9, 1); + return _mm512_sub_pd( + _mm512_mask_permutex2var_pd(a, 0xff, idx1, b), + _mm512_mask_permutex2var_pd(a, 0xff, idx2, b)); + } + __m512d abs_2_() const { + auto val_2 = _mm512_mul_pd(values, values); // a*a b*b + return hadd_pd(val_2, val_2); // a*a+b*b a*a+b*b + } + __m512d abs_() const { + auto real = _mm512_movedup_pd(values); // real real + // movehdup_pd does not exist... 
+ auto imag = _mm512_permute_pd(values, 0xff); // imag imag + return Sleef_hypotd8_u05(real, imag); // abs abs + } + Vectorized> abs() const { + const __m512d real_mask = _mm512_castsi512_pd(_mm512_setr_epi64( + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000)); + return _mm512_and_pd(abs_(), real_mask); // abs 0 + } + __m512d angle_() const { + // angle = atan2(b/a) + auto b_a = _mm512_permute_pd(values, 0x55); // b a + return Sleef_atan2d8_u10(values, b_a); // 90-angle angle + } + Vectorized> angle() const { + const __m512d real_mask = _mm512_castsi512_pd(_mm512_setr_epi64( + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000)); + auto angle = _mm512_permute_pd(angle_(), 0x55); // angle 90-angle + return _mm512_and_pd(angle, real_mask); // angle 0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized> sgn() const { auto abs = abs_(); @@ -208,16 +422,29 @@ template <> class Vectorized> { return _mm512_mask_blend_pd(mask, div, zero); } __m512d real_() const { +<<<<<<< HEAD const __m512d real_mask = _mm512_castsi512_pd(_mm512_setr_epi64(0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000)); +======= + const __m512d real_mask = _mm512_castsi512_pd(_mm512_setr_epi64( + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_and_pd(values, real_mask); } Vectorized> real() const { return real_(); } __m512d imag_() const { +<<<<<<< HEAD const __m512d imag_mask = _mm512_castsi512_pd(_mm512_setr_epi64(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, @@ -230,12 +457,37 @@ template <> class Vectorized> { __m512d conj_() const { const __m512d sign_mask = _mm512_setr_pd(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0); return _mm512_xor_pd(values, sign_mask); // a -b +======= + const __m512d imag_mask = _mm512_castsi512_pd(_mm512_setr_epi64( + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF)); + return _mm512_and_pd(values, imag_mask); + } + Vectorized> imag() const { + return _mm512_permute_pd(imag_(), 0x55); // b a + } + __m512d conj_() const { + const __m512d sign_mask = + _mm512_setr_pd(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0); + return _mm512_xor_pd(values, sign_mask); // a -b +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized> conj() const { return conj_(); } Vectorized> log() const { +<<<<<<< HEAD // Most trigonomic ops use the log() op to improve complex number performance. +======= + // Most trigonomic ops use the log() op to improve complex number + // performance. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(std::log); } Vectorized> log2() const { @@ -250,7 +502,12 @@ template <> class Vectorized> { return map(std::log1p); } Vectorized> asin() const { +<<<<<<< HEAD // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. +======= + // TODO: The vectorized implementation requires special handling for the + // case where real number/imag number is 0/Inf/NaN. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // // asin(x) // // = -i*ln(iz + sqrt(1 -z^2)) // // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi))) @@ -258,6 +515,7 @@ template <> class Vectorized> { // const __m512d one = _mm512_set1_pd(1); // auto conj = conj_(); +<<<<<<< HEAD // auto b_a = _mm512_permute_pd(conj, 0x55); //-b a // auto ab = _mm512_mul_pd(conj, b_a); //-ab -ab // auto im = _mm512_add_pd(ab, ab); //-2ab -2ab @@ -269,12 +527,31 @@ template <> class Vectorized> { // auto root = Vectorized(_mm512_mask_blend_pd(0xAA, re, im)).sqrt(); //sqrt(re + i*im) // auto ln = Vectorized(_mm512_add_pd(b_a, root)).log(); //ln(iz + sqrt()) // return Vectorized(_mm512_permute_pd(ln.values, 0x55)).conj(); //-i*ln() +======= + // auto b_a = _mm512_permute_pd(conj, 0x55); //-b a + // auto ab = _mm512_mul_pd(conj, b_a); //-ab + // -ab auto im = _mm512_add_pd(ab, ab); //-2ab -2ab + + // auto val_2 = _mm512_mul_pd(values, values); // a*a + // b*b auto re = hsub_pd(val_2, _mm512_permute_pd(val_2, 0x55)); // a*a-b*b + // b*b-a*a re = _mm512_sub_pd(one, re); + + // auto root = Vectorized(_mm512_mask_blend_pd(0xAA, re, im)).sqrt(); + // //sqrt(re + i*im) auto ln = Vectorized(_mm512_add_pd(b_a, root)).log(); + // //ln(iz + sqrt()) return Vectorized(_mm512_permute_pd(ln.values, + // 0x55)).conj(); //-i*ln() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(std::asin); } Vectorized> acos() const { // acos(x) = pi/2 - asin(x) constexpr auto pi_2d = c10::pi / 2; +<<<<<<< HEAD const __m512d pi_2 = _mm512_setr_pd(pi_2d, 0.0, pi_2d, 0.0, pi_2d, 0.0, pi_2d, 0.0); +======= + const __m512d pi_2 = + _mm512_setr_pd(pi_2d, 0.0, pi_2d, 0.0, pi_2d, 0.0, pi_2d, 0.0); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_sub_pd(pi_2, asin()); } Vectorized> atan() const; @@ -282,6 +559,7 @@ template <> class Vectorized> { return map(std::atanh); } Vectorized> exp() const { +<<<<<<< HEAD // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. // //exp(a + bi) // // = exp(a)*(cos(b) + sin(b)i) @@ -291,13 +569,33 @@ template <> class Vectorized> { // auto sin_cos = Sleef_sincosd8_u10(values); //[sin(a), cos(a)] [sin(b), cos(b)] // auto cos_sin = _mm512_mask_blend_pd(0xAA, _mm512_permute_pd(sin_cos.y, 0x55), // sin_cos.x); //cos(b) sin(b) +======= + // TODO: The vectorized implementation requires special handling for the + // case where real number/imag number is 0/Inf/NaN. 
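The commented-out vector path that follows is built on the identity exp(a + bi) = exp(a)·(cos b + i·sin b); a scalar reference under that identity, with an illustrative helper name that is not part of the patch.

#include <cmath>
#include <complex>

// Scalar reference for the identity behind the (disabled) vectorized exp():
// exp(a + bi) = exp(a) * (cos(b) + i*sin(b)).
inline std::complex<double> exp_complex_ref(std::complex<double> z) {
  const double scale = std::exp(z.real());
  return {scale * std::cos(z.imag()), scale * std::sin(z.imag())};
}
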
+ // //exp(a + bi) + // // = exp(a)*(cos(b) + sin(b)i) + // auto exp = Sleef_expd8_u10(values); //exp(a) exp(b) exp = + // _mm512_mask_blend_pd(0xAA, exp, _mm512_permute_pd(exp, 0x55)); //exp(a) + // exp(a) + + // auto sin_cos = Sleef_sincosd8_u10(values); //[sin(a), cos(a)] [sin(b), + // cos(b)] auto cos_sin = _mm512_mask_blend_pd(0xAA, + // _mm512_permute_pd(sin_cos.y, 0x55), + // sin_cos.x); //cos(b) + // sin(b) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // return _mm512_mul_pd(exp, cos_sin); return map(std::exp); } Vectorized> exp2() const { // Use identity 2**x = exp(log(2) * x) const __m512d ln_2 = _mm512_set1_pd(c10::ln_2); +<<<<<<< HEAD Vectorized> scaled_values = _mm512_mul_pd(values, ln_2); +======= + Vectorized> scaled_values = + _mm512_mul_pd(values, ln_2); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return scaled_values.exp(); } Vectorized> expm1() const { @@ -326,7 +624,12 @@ template <> class Vectorized> { return _mm512_sub_pd(zero, values); } Vectorized> round() const { +<<<<<<< HEAD return _mm512_roundscale_pd(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +======= + return _mm512_roundscale_pd( + values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized> tan() const { return map(std::tan); @@ -335,7 +638,12 @@ template <> class Vectorized> { return map(std::tanh); } Vectorized> trunc() const { +<<<<<<< HEAD return _mm512_roundscale_pd(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); +======= + return _mm512_roundscale_pd( + values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized> sqrt() const { return map(std::sqrt); @@ -344,7 +652,12 @@ template <> class Vectorized> { Vectorized> rsqrt() const { return sqrt().reciprocal(); } +<<<<<<< HEAD Vectorized> pow(const Vectorized> &exp) const { +======= + Vectorized> pow( + const Vectorized>& exp) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ c10::complex x_tmp[size()]; __at_align__ c10::complex y_tmp[size()]; store(x_tmp); @@ -357,6 +670,7 @@ template <> class Vectorized> { // Comparison using the _CMP_**_OQ predicate. // `O`: get false if an operand is NaN // `Q`: do not raise if an operand is NaN +<<<<<<< HEAD Vectorized> operator==(const Vectorized>& other) const { auto mask = _mm512_cmp_pd_mask(values, other.values, _CMP_EQ_OQ); return _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vector, mask, @@ -411,10 +725,86 @@ template <> Vectorized> inline operator*(const Vectorized Vectorized> inline operator/(const Vectorized> &a, const Vectorized> &b) { // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. 
+======= + Vectorized> operator==( + const Vectorized>& other) const { + auto mask = _mm512_cmp_pd_mask(values, other.values, _CMP_EQ_OQ); + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vector, mask, 0xFFFFFFFFFFFFFFFF)); + } + Vectorized> operator!=( + const Vectorized>& other) const { + auto mask = _mm512_cmp_pd_mask(values, other.values, _CMP_NEQ_UQ); + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vector, mask, 0xFFFFFFFFFFFFFFFF)); + } + Vectorized> operator<( + const Vectorized>& other [[maybe_unused]]) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator<=( + const Vectorized>& other [[maybe_unused]]) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator>( + const Vectorized>& other [[maybe_unused]]) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator>=( + const Vectorized>& other [[maybe_unused]]) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vectorized> eq( + const Vectorized>& other) const; + Vectorized> ne( + const Vectorized>& other) const; +}; + +template <> +Vectorized> inline operator+( + const Vectorized>& a, + const Vectorized>& b) { + return _mm512_add_pd(a, b); +} + +template <> +Vectorized> inline operator-( + const Vectorized>& a, + const Vectorized>& b) { + return _mm512_sub_pd(a, b); +} + +template <> +Vectorized> inline operator*( + const Vectorized>& a, + const Vectorized>& b) { + //(a + bi) * (c + di) = (ac - bd) + (ad + bc)i + const __m512d sign_mask = + _mm512_setr_pd(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0); + auto ac_bd = _mm512_mul_pd(a, b); // ac bd + + auto d_c = _mm512_permute_pd(b, 0x55); // d c + d_c = _mm512_xor_pd(sign_mask, d_c); // d -c + auto ad_bc = _mm512_mul_pd(a, d_c); // ad -bc + + auto ret = Vectorized>::hsub_pd( + ac_bd, ad_bc); // ac - bd ad + bc + return ret; +} + +template <> +Vectorized> inline operator/( + const Vectorized>& a, + const Vectorized>& b) { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // //re + im*i = (a + bi) / (c + di) // auto mask = _mm512_set1_pd(-0.f); // auto fabs_cd = _mm512_andnot_pd(mask, b); // |c| |d| // auto fabs_dc = _mm512_permute_pd(fabs_cd, 0x55); // |d| |c| +<<<<<<< HEAD // auto scale = _mm512_rcp14_pd(_mm512_max_pd(fabs_cd, fabs_dc)); // 1/sc 1/sc // auto a2 = _mm512_mul_pd(a, scale); // a/sc b/sc // auto b2 = _mm512_mul_pd(b, scale); // c/sc d/sc @@ -433,6 +823,30 @@ template <> Vectorized> inline operator/(const Vectorized tmp1[Vectorized>::size()]; __at_align__ c10::complex tmp2[Vectorized>::size()]; __at_align__ c10::complex out[Vectorized>::size()]; +======= + // auto scale = _mm512_rcp14_pd(_mm512_max_pd(fabs_cd, fabs_dc)); // 1/sc + // 1/sc auto a2 = _mm512_mul_pd(a, scale); // a/sc b/sc auto b2 = + // _mm512_mul_pd(b, scale); // c/sc d/sc auto acbd2 = + // _mm512_mul_pd(a2, b2); + + // const __m512d sign_mask = _mm512_setr_pd(-0.0, 0.0, -0.0, 0.0, -0.0, 0.0, + // -0.0, 0.0); auto dc2 = _mm512_permute_pd(b2, 0x55); // d/sc c/sc + // dc2 = _mm512_xor_pd(sign_mask, dc2); // -d/|c,d| c/sc + // auto adbc2 = _mm512_mul_pd(a2, dc2); //-ad/sc^2 bc/sc^2 + // auto res2 = Vectorized>::hadd_pd(acbd2, adbc2); + // //(ac+bd)/sc^2 (bc-ad)/sc^2 + + // // get the denominator + // auto denom2 = Vectorized>(b2).abs_2_(); // + // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 res2 = _mm512_div_pd(res2, denom2); return + // res2; + __at_align__ c10::complex + tmp1[Vectorized>::size()]; + __at_align__ c10::complex + tmp2[Vectorized>::size()]; + __at_align__ c10::complex + out[Vectorized>::size()]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) a.store(tmp1); b.store(tmp2); for (const auto i : c10::irange(Vectorized>::size())) { @@ -442,6 +856,7 @@ template <> Vectorized> inline operator/(const Vectorized> Vectorized>::reciprocal() const{ // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. // //re + im*i = (a + bi) / (c + di) @@ -449,6 +864,17 @@ inline Vectorized> Vectorized>::recipr // //im = (bc - ad)/abs_2() = d/abs_2() // const __m512d sign_mask = _mm512_setr_pd(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0); // auto c_d = _mm512_xor_pd(sign_mask, values); //c -d +======= +inline Vectorized> Vectorized< + c10::complex>::reciprocal() const { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. + // //re + im*i = (a + bi) / (c + di) + // //re = (ac + bd)/abs_2() = c/abs_2() + // //im = (bc - ad)/abs_2() = d/abs_2() + // const __m512d sign_mask = _mm512_setr_pd(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, + // 0.0, -0.0); auto c_d = _mm512_xor_pd(sign_mask, values); //c -d +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // return _mm512_div_pd(c_d, abs_2_()); __at_align__ c10::complex tmp[size()]; store(tmp); @@ -458,6 +884,7 @@ inline Vectorized> Vectorized>::recipr return loadu(tmp); } +<<<<<<< HEAD inline Vectorized> Vectorized>::atan() const { // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. 
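The commented-out derivation that follows uses atan(z) = (i/2)·ln((i + z)/(i − z)); a scalar reference under that identity, with an illustrative helper name that is not part of the patch.

#include <complex>

// Scalar reference for the identity behind the (disabled) vectorized atan():
// atan(z) = (i/2) * log((i + z) / (i - z)).
inline std::complex<double> atan_complex_ref(std::complex<double> z) {
  const std::complex<double> i(0.0, 1.0);
  return (i / 2.0) * std::log((i + z) / (i - z));
}
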
// // atan(x) = i/2 * ln((i + z)/(i - z)) @@ -468,12 +895,33 @@ inline Vectorized> Vectorized>::atan() // auto sub = Vectorized(_mm512_sub_pd(i, values)); // -a 1-b // auto ln = (sum/sub).log(); // ln((i + z)/(i - z)) // return i_half*ln; // i/2*ln() +======= +inline Vectorized> Vectorized>::atan() + const { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. + // // atan(x) = i/2 * ln((i + z)/(i - z)) + // const __m512d i = _mm512_setr_pd(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0); + // const Vectorized i_half = _mm512_setr_pd(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, + // 0.5); + + // auto sum = Vectorized(_mm512_add_pd(i, values)); // a + // 1+b auto sub = Vectorized(_mm512_sub_pd(i, values)); // -a 1-b auto + // ln = (sum/sub).log(); // ln((i + + // z)/(i - z)) return i_half*ln; // i/2*ln() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(std::atan); } template <> +<<<<<<< HEAD Vectorized> inline maximum(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline maximum( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto zero_vec = _mm512_set1_epi64(0); auto abs_a = a.abs_2_(); auto abs_b = b.abs_2_(); @@ -481,14 +929,24 @@ Vectorized> inline maximum(const Vectorized>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_or_pd(max, _mm512_castsi512_pd(isnan)); } template <> +<<<<<<< HEAD Vectorized> inline minimum(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline minimum( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto zero_vec = _mm512_set1_epi64(0); auto abs_a = a.abs_2_(); auto abs_b = b.abs_2_(); @@ -496,24 +954,41 @@ Vectorized> inline minimum(const Vectorized>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_or_pd(min, _mm512_castsi512_pd(isnan)); } template <> +<<<<<<< HEAD Vectorized> inline operator&(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline operator&( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_and_pd(a, b); } template <> +<<<<<<< HEAD Vectorized> inline operator|(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline operator|( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_or_pd(a, b); } template <> +<<<<<<< HEAD Vectorized> inline operator^(const Vectorized>& a, const Vectorized>& b) { return _mm512_xor_pd(a, b); @@ -529,8 +1004,37 @@ inline Vectorized> Vectorized>::ne(con auto ne = (*this != other); // compares real and imag individually // If either real numbers or imag numbers are not equal, then the complex numbers are not equal return (ne.real() | ne.imag()) & Vectorized>(_mm512_set1_pd(1.0)); +======= +Vectorized> inline operator^( + const Vectorized>& a, + const Vectorized>& b) { + return _mm512_xor_pd(a, 
b); +} + +inline Vectorized> Vectorized>::eq( + const Vectorized>& other) const { + auto eq = (*this == other); // compares real and imag individually + // If both real numbers and imag numbers are equal, then the complex numbers + // are equal + return (eq.real() & eq.imag()) & + Vectorized>(_mm512_set1_pd(1.0)); +} + +inline Vectorized> Vectorized>::ne( + const Vectorized>& other) const { + auto ne = (*this != other); // compares real and imag individually + // If either real numbers or imag numbers are not equal, then the complex + // numbers are not equal + return (ne.real() | ne.imag()) & + Vectorized>(_mm512_set1_pd(1.0)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif +<<<<<<< HEAD }} +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h index 4b07fb3af863..7b2a00cfce8a 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h @@ -3,27 +3,51 @@ // DO NOT DEFINE STATIC DATA IN THIS HEADER! // See Note [Do not compile initializers with AVX] +<<<<<<< HEAD #include #include #include #include +======= +#include +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if defined(CPU_CAPABILITY_AVX512) #define SLEEF_STATIC_LIBS #include #endif +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { #if defined(CPU_CAPABILITY_AVX512) +<<<<<<< HEAD template <> class Vectorized> { private: __m512 values; static constexpr __m512i zero_vector {0, 0, 0, 0, 0, 0, 0, 0}; public: +======= +template <> +struct is_vec_specialized_for> : std::bool_constant { +}; + +template <> +class Vectorized> { + private: + __m512 values; + static constexpr __m512i zero_vector{0, 0, 0, 0, 0, 0, 0, 0}; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = c10::complex; using size_type = int; static constexpr size_type size() { @@ -34,6 +58,7 @@ template <> class Vectorized> { Vectorized(c10::complex val) { float real_value = val.real(); float imag_value = val.imag(); +<<<<<<< HEAD values = _mm512_setr_ps(real_value, imag_value, real_value, imag_value, real_value, imag_value, @@ -55,13 +80,65 @@ template <> class Vectorized> { val6.real(), val6.imag(), val7.real(), val7.imag(), val8.real(), val8.imag()); +======= + values = _mm512_setr_ps( + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value); + } + Vectorized( + c10::complex val1, + c10::complex val2, + c10::complex val3, + c10::complex val4, + c10::complex val5, + c10::complex val6, + c10::complex val7, + c10::complex val8) { + values = _mm512_setr_ps( + val1.real(), + val1.imag(), + val2.real(), + val2.imag(), + val3.real(), + val3.imag(), + val4.real(), + val4.imag(), + val5.real(), + val5.imag(), + 
val6.real(), + val6.imag(), + val7.real(), + val7.imag(), + val8.real(), + val8.imag()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } operator __m512() const { return values; } template +<<<<<<< HEAD static Vectorized> blend(const Vectorized>& a, const Vectorized>& b) { +======= + static Vectorized> blend( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // convert c10::complex index mask to V index mask: xy -> xxyy static_assert(mask > -1 && mask < 256, "Unexpected mask value"); // The compiler would hopefully convert this switch condition @@ -577,6 +654,7 @@ template <> class Vectorized> { return _mm512_mask_blend_ps(0xFFF3, a.values, b.values); case 254: return _mm512_mask_blend_ps(0xFFFC, a.values, b.values); +<<<<<<< HEAD default: break; } return b; @@ -605,6 +683,42 @@ template <> class Vectorized> { static Vectorized> set(const Vectorized>& a, const Vectorized>& b, int64_t count = size()) { +======= + default: + break; + } + return b; + } + static Vectorized> blendv( + const Vectorized>& a, + const Vectorized>& b, + const Vectorized>& mask) { + // convert c10::complex index mask to V index mask: xy -> xxyy + auto mask_ = _mm512_unpacklo_ps(mask.values, mask.values); + auto all_ones = _mm512_set1_epi32(0xFFFFFFFF); + auto mmask = _mm512_cmp_epi32_mask( + _mm512_castps_si512(mask_), all_ones, _MM_CMPINT_EQ); + return _mm512_mask_blend_ps(mmask, a.values, b.values); + } + template + static Vectorized> arange( + c10::complex base = 0., + step_t step = static_cast(1)) { + return Vectorized>( + base, + base + step, + base + c10::complex(2) * step, + base + c10::complex(3) * step, + base + c10::complex(4) * step, + base + c10::complex(5) * step, + base + c10::complex(6) * step, + base + c10::complex(7) * step); + } + static Vectorized> set( + const Vectorized>& a, + const Vectorized>& b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -625,6 +739,7 @@ template <> class Vectorized> { } return b; } +<<<<<<< HEAD static Vectorized> loadu(const void* ptr, int64_t count = size()) { if (count == size()) return _mm512_loadu_ps(reinterpret_cast(ptr)); @@ -634,6 +749,20 @@ template <> class Vectorized> { // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. for (const auto i : c10::irange(2*size())) { +======= + static Vectorized> loadu( + const void* ptr, + int64_t count = size()) { + if (count == size()) + return _mm512_loadu_ps(reinterpret_cast(ptr)); + + __at_align__ float tmp_values[2 * size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
+ for (const auto i : c10::irange(2 * size())) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) tmp_values[i] = 0.0; } std::memcpy( @@ -646,7 +775,11 @@ template <> class Vectorized> { if (count == size()) { _mm512_storeu_ps(reinterpret_cast(ptr), values); } else if (count > 0) { +<<<<<<< HEAD float tmp_values[2*size()]; +======= + float tmp_values[2 * size()]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm512_storeu_ps(reinterpret_cast(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(c10::complex)); } @@ -654,6 +787,7 @@ template <> class Vectorized> { // AVX512 doesn't have horizontal add & horizontal sub instructions. // TODO: hadd_pd() & hsub_pd() may have scope for improvement. static inline __m512 hadd_ps(__m512 a, __m512 b) { +<<<<<<< HEAD __m512i idx1 = _mm512_set_epi32(30, 14, 28, 12, 26, 10, 24, 8, 22, 6, 20, 4, 18, 2, 16, 0); __m512i idx2 = _mm512_set_epi32(31, 15, 29, 13, 27, 11, 25, 9, 23, 7, 21, 5, 19, 3, 17, 1); return _mm512_add_ps(_mm512_mask_permutex2var_ps(a, 0xffff, idx1, b), @@ -668,6 +802,29 @@ template <> class Vectorized> { const c10::complex& operator[](int idx) const = delete; c10::complex& operator[](int idx) = delete; Vectorized> map(c10::complex (*const f)(const c10::complex &)) const { +======= + __m512i idx1 = _mm512_set_epi32( + 30, 14, 28, 12, 26, 10, 24, 8, 22, 6, 20, 4, 18, 2, 16, 0); + __m512i idx2 = _mm512_set_epi32( + 31, 15, 29, 13, 27, 11, 25, 9, 23, 7, 21, 5, 19, 3, 17, 1); + return _mm512_add_ps( + _mm512_mask_permutex2var_ps(a, 0xffff, idx1, b), + _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b)); + } + static inline __m512 hsub_ps(__m512 a, __m512 b) { + __m512i idx1 = _mm512_set_epi32( + 30, 14, 28, 12, 26, 10, 24, 8, 22, 6, 20, 4, 18, 2, 16, 0); + __m512i idx2 = _mm512_set_epi32( + 31, 15, 29, 13, 27, 11, 25, 9, 23, 7, 21, 5, 19, 3, 17, 1); + return _mm512_sub_ps( + _mm512_mask_permutex2var_ps(a, 0xffff, idx1, b), + _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b)); + } + const c10::complex& operator[](int idx) const = delete; + c10::complex& operator[](int idx) = delete; + Vectorized> map( + c10::complex (*const f)(const c10::complex&)) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ c10::complex tmp[size()]; store(tmp); for (const auto i : c10::irange(size())) { @@ -676,6 +833,7 @@ template <> class Vectorized> { return loadu(tmp); } __m512 abs_2_() const { +<<<<<<< HEAD auto val_2 = _mm512_mul_ps(values, values); // a*a b*b auto ret = hadd_ps(val_2, val_2); // a*a+b*b a*a+b*b return ret; @@ -704,6 +862,62 @@ template <> class Vectorized> { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)); auto angle = _mm512_permute_ps(angle_(), 0xB1); // angle 90-angle return _mm512_and_ps(angle, real_mask); // angle 0 +======= + auto val_2 = _mm512_mul_ps(values, values); // a*a b*b + auto ret = hadd_ps(val_2, val_2); // a*a+b*b a*a+b*b + return ret; + } + __m512 abs_() const { + auto real = _mm512_moveldup_ps(values); // real real + auto imag = _mm512_movehdup_ps(values); // imag imag + return Sleef_hypotf16_u05(real, imag); // abs abs + } + Vectorized> abs() const { + const __m512 real_mask = _mm512_castsi512_ps(_mm512_setr_epi32( + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 
0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000)); + return _mm512_and_ps(abs_(), real_mask); // abs 0 + } + __m512 angle_() const { + // angle = atan2(b/a) + auto b_a = _mm512_permute_ps(values, 0xB1); // b a + return Sleef_atan2f16_u10(values, b_a); // 90-angle angle + } + Vectorized> angle() const { + const __m512 real_mask = _mm512_castsi512_ps(_mm512_setr_epi32( + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000)); + auto angle = _mm512_permute_ps(angle_(), 0xB1); // angle 90-angle + return _mm512_and_ps(angle, real_mask); // angle 0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized> sgn() const { auto abs = abs_(); @@ -713,16 +927,37 @@ template <> class Vectorized> { return _mm512_mask_blend_ps(mask, div, zero); } __m512 real_() const { +<<<<<<< HEAD const __m512 real_mask = _mm512_castsi512_ps(_mm512_setr_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)); +======= + const __m512 real_mask = _mm512_castsi512_ps(_mm512_setr_epi32( + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_and_ps(values, real_mask); } Vectorized> real() const { return real_(); } __m512 imag_() const { +<<<<<<< HEAD const __m512 imag_mask = _mm512_castsi512_ps(_mm512_setr_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, @@ -736,12 +971,60 @@ template <> class Vectorized> { const __m512 sign_mask = _mm512_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0); return _mm512_xor_ps(values, sign_mask); // a -b +======= + const __m512 imag_mask = _mm512_castsi512_ps(_mm512_setr_epi32( + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF)); + return _mm512_and_ps(values, imag_mask); + } + Vectorized> imag() const { + return _mm512_permute_ps(imag_(), 0xB1); // b a + } + __m512 conj_() const { + const __m512 sign_mask = _mm512_setr_ps( + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0); + return _mm512_xor_ps(values, sign_mask); // a -b +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized> conj() const { return conj_(); } Vectorized> log() const { +<<<<<<< HEAD // Most trigonomic ops use the log() op to improve complex number performance. +======= + // Most trigonomic ops use the log() op to improve complex number + // performance. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(std::log); } Vectorized> log2() const { @@ -756,7 +1039,12 @@ template <> class Vectorized> { return map(std::log1p); } Vectorized> asin() const { +<<<<<<< HEAD // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. +======= + // TODO: The vectorized implementation requires special handling for the + // case where real number/imag number is 0/Inf/NaN. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // // asin(x) // // = -i*ln(iz + sqrt(1 -z^2)) // // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi))) @@ -764,6 +1052,7 @@ template <> class Vectorized> { // const __m512 one = _mm512_set1_ps(1); // auto conj = conj_(); +<<<<<<< HEAD // auto b_a = _mm512_permute_ps(conj, 0xB1); //-b a // auto ab = _mm512_mul_ps(conj, b_a); //-ab -ab // auto im = _mm512_add_ps(ab, ab); //-2ab -2ab @@ -775,6 +1064,20 @@ template <> class Vectorized> { // auto root = Vectorized(_mm512_mask_blend_ps(0xAAAA, re, im)).sqrt(); //sqrt(re + i*im) // auto ln = Vectorized(_mm512_add_ps(b_a, root)).log(); //ln(iz + sqrt()) // return Vectorized(_mm512_permute_ps(ln.values, 0xB1)).conj(); //-i*ln() +======= + // auto b_a = _mm512_permute_ps(conj, 0xB1); //-b a + // auto ab = _mm512_mul_ps(conj, b_a); //-ab + // -ab auto im = _mm512_add_ps(ab, ab); //-2ab -2ab + + // auto val_2 = _mm512_mul_ps(values, values); // a*a + // b*b auto re = hsub_ps(val_2, _mm512_permute_ps(val_2, 0xB1)); // a*a-b*b + // b*b-a*a re = _mm512_sub_ps(one, re); + + // auto root = Vectorized(_mm512_mask_blend_ps(0xAAAA, re, im)).sqrt(); + // //sqrt(re + i*im) auto ln = Vectorized(_mm512_add_ps(b_a, root)).log(); + // //ln(iz + sqrt()) return Vectorized(_mm512_permute_ps(ln.values, + // 0xB1)).conj(); //-i*ln() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(std::asin); } Vectorized> acos() const { @@ -785,6 +1088,7 @@ template <> class Vectorized> { return map(std::atanh); } Vectorized> exp() const { +<<<<<<< HEAD // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. // //exp(a + bi) // // = exp(a)*(cos(b) + sin(b)i) @@ -794,6 +1098,21 @@ template <> class Vectorized> { // auto sin_cos = Sleef_sincosf16_u10(values); //[sin(a), cos(a)] [sin(b), cos(b)] // auto cos_sin = _mm512_mask_blend_ps(0xAAAA, _mm512_permute_ps(sin_cos.y, 0xB1), // sin_cos.x); //cos(b) sin(b) +======= + // TODO: The vectorized implementation requires special handling for the + // case where real number/imag number is 0/Inf/NaN. 
+ // //exp(a + bi) + // // = exp(a)*(cos(b) + sin(b)i) + // auto exp = Sleef_expf16_u10(values); //exp(a) exp(b) exp = + // _mm512_mask_blend_ps(0xAAAA, exp, _mm512_permute_ps(exp, 0xB1)); //exp(a) + // exp(a) + + // auto sin_cos = Sleef_sincosf16_u10(values); //[sin(a), cos(a)] [sin(b), + // cos(b)] auto cos_sin = _mm512_mask_blend_ps(0xAAAA, + // _mm512_permute_ps(sin_cos.y, 0xB1), + // sin_cos.x); //cos(b) + // sin(b) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // return _mm512_mul_ps(exp, cos_sin); return map(std::exp); } @@ -829,7 +1148,12 @@ template <> class Vectorized> { return _mm512_sub_ps(zero, values); } Vectorized> round() const { +<<<<<<< HEAD return _mm512_roundscale_ps(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +======= + return _mm512_roundscale_ps( + values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized> tan() const { return map(std::tan); @@ -838,7 +1162,12 @@ template <> class Vectorized> { return map(std::tanh); } Vectorized> trunc() const { +<<<<<<< HEAD return _mm512_roundscale_ps(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); +======= + return _mm512_roundscale_ps( + values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized> sqrt() const { return map(std::sqrt); @@ -847,7 +1176,12 @@ template <> class Vectorized> { Vectorized> rsqrt() const { return sqrt().reciprocal(); } +<<<<<<< HEAD Vectorized> pow(const Vectorized> &exp) const { +======= + Vectorized> pow( + const Vectorized>& exp) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ c10::complex x_tmp[size()]; __at_align__ c10::complex y_tmp[size()]; store(x_tmp); @@ -860,6 +1194,7 @@ template <> class Vectorized> { // Comparison using the _CMP_**_OQ predicate. // `O`: get false if an operand is NaN // `Q`: do not raise if an operand is NaN +<<<<<<< HEAD Vectorized> operator==(const Vectorized>& other) const { auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_EQ_OQ); return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF)); @@ -913,10 +1248,101 @@ template <> Vectorized> inline operator*(const Vectorized Vectorized> inline operator/(const Vectorized> &a, const Vectorized> &b) { // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. 
+======= + Vectorized> operator==( + const Vectorized>& other) const { + auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_EQ_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF)); + } + Vectorized> operator!=( + const Vectorized>& other) const { + auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_NEQ_UQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF)); + } + Vectorized> operator<( + const Vectorized>& other [[maybe_unused]]) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator<=( + const Vectorized>& other [[maybe_unused]]) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator>( + const Vectorized>& other [[maybe_unused]]) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator>=( + const Vectorized>& other [[maybe_unused]]) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vectorized> eq( + const Vectorized>& other) const; + Vectorized> ne( + const Vectorized>& other) const; +}; + +template <> +Vectorized> inline operator+( + const Vectorized>& a, + const Vectorized>& b) { + return _mm512_add_ps(a, b); +} + +template <> +Vectorized> inline operator-( + const Vectorized>& a, + const Vectorized>& b) { + return _mm512_sub_ps(a, b); +} + +template <> +Vectorized> inline operator*( + const Vectorized>& a, + const Vectorized>& b) { + //(a + bi) * (c + di) = (ac - bd) + (ad + bc)i + const __m512 sign_mask = _mm512_setr_ps( + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0); + auto ac_bd = _mm512_mul_ps(a, b); // ac bd + + auto d_c = _mm512_permute_ps(b, 0xB1); // d c + d_c = _mm512_xor_ps(sign_mask, d_c); // d -c + auto ad_bc = _mm512_mul_ps(a, d_c); // ad -bc + + auto ret = Vectorized>::hsub_ps( + ac_bd, ad_bc); // ac - bd ad + bc + return ret; +} + +template <> +Vectorized> inline operator/( + const Vectorized>& a, + const Vectorized>& b) { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. 
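// Illustrative aside -- a scalar form, not code from this patch, of the product rule the
// vectorized operator* above implements: (a + bi)(c + di) = (ac - bd) + (ad + bc)i.
// The intrinsics form the lane pairs (ac, bd) and (ad, -bc) and fold each pair with
// hsub_ps to obtain the real and imaginary components.
#include <complex>
#include <cstdio>

static std::complex<float> scalar_complex_mul(std::complex<float> x, std::complex<float> y) {
  float a = x.real(), b = x.imag(), c = y.real(), d = y.imag();
  return {a * c - b * d, a * d + b * c};
}

int main() {
  // (1 + 2i)(3 + 4i) = (3 - 8) + (4 + 6)i = -5 + 10i
  std::complex<float> p = scalar_complex_mul({1.0f, 2.0f}, {3.0f, 4.0f});
  std::printf("%g %+gi\n", p.real(), p.imag());
  return 0;
}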
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // //re + im*i = (a + bi) / (c + di) // auto mask = _mm512_set1_ps(-0.f); // auto fabs_cd = _mm512_andnot_ps(mask, b); // |c| |d| // auto fabs_dc = _mm512_permute_ps(fabs_cd, 0xB1); // |d| |c| +<<<<<<< HEAD // auto scale = _mm512_rcp14_ps(_mm512_max_ps(fabs_cd, fabs_dc)); // 1/sc 1/sc // auto a2 = _mm512_mul_ps(a, scale); // a/sc b/sc // auto b2 = _mm512_mul_ps(b, scale); // c/sc d/sc @@ -935,6 +1361,31 @@ template <> Vectorized> inline operator/(const Vectorized tmp1[Vectorized>::size()]; __at_align__ c10::complex tmp2[Vectorized>::size()]; +======= + // auto scale = _mm512_rcp14_ps(_mm512_max_ps(fabs_cd, fabs_dc)); // 1/sc + // 1/sc auto a2 = _mm512_mul_ps(a, scale); // a/sc b/sc auto b2 = + // _mm512_mul_ps(b, scale); // c/sc d/sc auto acbd2 = + // _mm512_mul_ps(a2, b2); + + // const __m512 sign_mask = _mm512_setr_ps(-0.0, 0.0, -0.0, 0.0, -0.0, 0.0, + // -0.0, 0.0, + // -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, + // -0.0, 0.0); + // auto dc2 = _mm512_permute_ps(b2, 0xB1); // d/sc c/sc + // dc2 = _mm512_xor_ps(sign_mask, dc2); // -d/|c,d| c/sc + // auto adbc2 = _mm512_mul_ps(a2, dc2); //-ad/sc^2 bc/sc^2 + // auto res2 = Vectorized>::hadd_ps(acbd2, adbc2); + // //(ac+bd)/sc^2 (bc-ad)/sc^2 + + // // get the denominator + // auto denom2 = Vectorized>(b2).abs_2_(); // + // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 res2 = _mm512_div_ps(res2, denom2); return + // res2; + __at_align__ c10::complex + tmp1[Vectorized>::size()]; + __at_align__ c10::complex + tmp2[Vectorized>::size()]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ c10::complex out[Vectorized>::size()]; a.store(tmp1); b.store(tmp2); @@ -945,6 +1396,7 @@ template <> Vectorized> inline operator/(const Vectorized> Vectorized>::reciprocal() const { // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. // //re + im*i = (a + bi) / (c + di) @@ -952,6 +1404,19 @@ inline Vectorized> Vectorized>::reciproc // //im = (bc - ad)/abs_2() = d/abs_2() // const __m512 sign_mask = _mm512_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, // 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0); +======= +inline Vectorized> Vectorized< + c10::complex>::reciprocal() const { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. + // //re + im*i = (a + bi) / (c + di) + // //re = (ac + bd)/abs_2() = c/abs_2() + // //im = (bc - ad)/abs_2() = d/abs_2() + // const __m512 sign_mask = _mm512_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, + // 0.0, -0.0, + // 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, + // 0.0, -0.0); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // auto c_d = _mm512_xor_ps(sign_mask, values); //c -d // return _mm512_div_ps(c_d, abs_2_()); __at_align__ c10::complex tmp[size()]; @@ -962,6 +1427,7 @@ inline Vectorized> Vectorized>::reciproc return loadu(tmp); } +<<<<<<< HEAD inline Vectorized> Vectorized>::atan() const { // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. 
// // atan(x) = i/2 * ln((i + z)/(i - z)) @@ -974,12 +1440,36 @@ inline Vectorized> Vectorized>::atan() c // auto sub = Vectorized(_mm512_sub_ps(i, values)); // -a 1-b // auto ln = (sum/sub).log(); // ln((i + z)/(i - z)) // return i_half*ln; // i/2*ln() +======= +inline Vectorized> Vectorized>::atan() + const { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. + // // atan(x) = i/2 * ln((i + z)/(i - z)) + // const __m512 i = _mm512_setr_ps(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, + // 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0); + // const Vectorized i_half = _mm512_setr_ps(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, + // 0.5, + // 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, + // 0.5); + + // auto sum = Vectorized(_mm512_add_ps(i, values)); // a + // 1+b auto sub = Vectorized(_mm512_sub_ps(i, values)); // -a 1-b auto + // ln = (sum/sub).log(); // ln((i + + // z)/(i - z)) return i_half*ln; // i/2*ln() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(std::atan); } template <> +<<<<<<< HEAD Vectorized> inline maximum(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline maximum( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto zero_vector = _mm512_set1_epi32(0); auto abs_a = a.abs_2_(); auto abs_b = b.abs_2_(); @@ -992,8 +1482,14 @@ Vectorized> inline maximum(const Vectorized +<<<<<<< HEAD Vectorized> inline minimum(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline minimum( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto zero_vector = _mm512_set1_epi32(0); auto abs_a = a.abs_2_(); auto abs_b = b.abs_2_(); @@ -1006,37 +1502,76 @@ Vectorized> inline minimum(const Vectorized +<<<<<<< HEAD Vectorized> inline operator&(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline operator&( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_and_ps(a, b); } template <> +<<<<<<< HEAD Vectorized> inline operator|(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline operator|( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_or_ps(a, b); } template <> +<<<<<<< HEAD Vectorized> inline operator^(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline operator^( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_xor_ps(a, b); } inline Vectorized> Vectorized>::eq( const Vectorized>& other) const { +<<<<<<< HEAD auto eq = (*this == other); // compares real and imag individually // If both real numbers and imag numbers are equal, then the complex numbers are equal return (eq.real() & eq.imag()) & Vectorized>(_mm512_set1_ps(1.0f)); +======= + auto eq = (*this == other); // compares real and imag individually + // If both real numbers and imag numbers are equal, then the complex numbers + // 
are equal + return (eq.real() & eq.imag()) & + Vectorized>(_mm512_set1_ps(1.0f)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline Vectorized> Vectorized>::ne( const Vectorized>& other) const { +<<<<<<< HEAD auto ne = (*this != other); // compares real and imag individually // If either real numbers or imag numbers are not equal, then the complex numbers are not equal return (ne.real() | ne.imag()) & Vectorized>(_mm512_set1_ps(1.0f)); +======= + auto ne = (*this != other); // compares real and imag individually + // If either real numbers or imag numbers are not equal, then the complex + // numbers are not equal + return (ne.real() | ne.imag()) & + Vectorized>(_mm512_set1_ps(1.0f)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif +<<<<<<< HEAD }} +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_convert.h b/aten/src/ATen/cpu/vec/vec512/vec512_convert.h index af4801cccf48..a39b5b665e90 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_convert.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_convert.h @@ -159,7 +159,11 @@ template <> struct VecConvert { static inline VectorizedN apply( const VectorizedN& src) { +<<<<<<< HEAD return Vectorized(_mm512_cvttps_epi32(src[0])); +======= + return Vectorized(_mm512_cvttps_epi32(src[0])); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } }; @@ -167,7 +171,11 @@ template <> struct VecConvert { static inline VectorizedN apply( const VectorizedN& src) { +<<<<<<< HEAD return Vectorized(_mm512_cvtepi32_ps(src[0])); +======= + return Vectorized(_mm512_cvtepi32_ps(src[0])); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } }; @@ -220,13 +228,24 @@ struct VecConvert< 1, float, 2, +<<<<<<< HEAD typename std::enable_if_t, void>> { +======= + typename std::enable_if_t, void>> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline VectorizedN apply(const VectorizedN& src) { at::vec::Vectorized vec1 = convert_float_to_int8(src[0]); at::vec::Vectorized vec2 = convert_float_to_int8(src[1]); __m128 lane2 = _mm512_castps512_ps128(_mm512_castsi512_ps(vec2)); +<<<<<<< HEAD __m512 result = _mm512_insertf32x4(_mm512_castsi512_ps(vec1), lane2, 1); // Insert lane2 into the second 128-bit lane +======= + __m512 result = _mm512_insertf32x4( + _mm512_castsi512_ps(vec1), + lane2, + 1); // Insert lane2 into the second 128-bit lane +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::vec::Vectorized(_mm512_castps_si512(result)); } }; @@ -237,8 +256,12 @@ struct VecConvert< 1, float, 1, +<<<<<<< HEAD typename std::enable_if_t, void>> { +======= + typename std::enable_if_t, void>> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline VectorizedN apply(const VectorizedN& src) { return convert_float_to_int8(src[0]); } @@ -250,6 +273,7 @@ struct VecConvert< 2, src_t, 1, +<<<<<<< HEAD typename std::enable_if_t, void>> { 
static inline VectorizedN apply(const VectorizedN& src) { @@ -259,6 +283,17 @@ struct VecConvert< ) ); return VectorizedN(convert_int8_to_float(src[0]), convert_int8_to_float(src2)); +======= + typename std::enable_if_t, void>> { + static inline VectorizedN apply(const VectorizedN& src) { + __m512i src2 = + _mm512_castsi128_si512(_mm_castps_si128(_mm512_extractf32x4_ps( + _mm512_castsi512_ps(src[0]), 1) // Extract the second 128-bit lane + )); + return VectorizedN( + convert_int8_to_float(src[0]), + convert_int8_to_float(src2)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } }; @@ -268,8 +303,12 @@ struct VecConvert< 1, src_t, 1, +<<<<<<< HEAD typename std::enable_if_t, void>> { +======= + typename std::enable_if_t, void>> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline VectorizedN apply(const VectorizedN& src) { return convert_int8_to_float(src[0]); } @@ -282,8 +321,12 @@ struct VecConvert< int64_t, 2, std::enable_if_t< +<<<<<<< HEAD std::is_same_v || std::is_same_v>> { +======= + std::is_same_v || std::is_same_v>> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline VectorizedN apply( const VectorizedN& src) { return VecConvert::apply( @@ -291,6 +334,53 @@ struct VecConvert< } }; +<<<<<<< HEAD +======= +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src_n) { + at::vec::Vectorized src = src_n[0]; + __m128i res128 = cvtfp32_fp8e4m3(src); + return at::vec::Vectorized(_mm512_castsi128_si512(res128)); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src_n) { + // cvt first 16x8 bits from Float8_e4m3fn to float + at::vec::Vectorized src = src_n[0]; + __m512 result; + cvtfp8e4m3_fp32(_mm512_castsi512_si128(src), result); + return at::vec::Vectorized(result); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src_n) { + at::vec::Vectorized src = src_n[0]; + __m128i res128 = cvtfp32_fp8e5m2(src); + return at::vec::Vectorized(_mm512_castsi128_si512(res128)); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src_n) { + // cvt first 16x8 bits from Float8_e5m2 to float + at::vec::Vectorized src = src_n[0]; + __m512 result; + cvtfp8e5m2_fp32(_mm512_castsi512_si128(src), result); + return at::vec::Vectorized(result); + } +}; + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif } // namespace CPU_CAPABILITY diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h index 4d2554f231d4..150c03ee8a2e 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h @@ -11,17 +11,32 @@ #include #endif +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { #if defined(CPU_CAPABILITY_AVX512) +<<<<<<< HEAD template <> class Vectorized { private: static constexpr __m512i zero_vector {0, 0, 0, 0, 0, 0, 0, 0}; public: +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> 
+class Vectorized { + private: + static constexpr __m512i zero_vector{0, 0, 0, 0, 0, 0, 0, 0}; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // values needs to be public for compilation with clang // as vec512.h uses it __m512d values; @@ -35,14 +50,27 @@ template <> class Vectorized { Vectorized(double val) { values = _mm512_set1_pd(val); } +<<<<<<< HEAD Vectorized(double val1, double val2, double val3, double val4, double val5, double val6, double val7, double val8) { +======= + Vectorized( + double val1, + double val2, + double val3, + double val4, + double val5, + double val6, + double val7, + double val8) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) values = _mm512_setr_pd(val1, val2, val3, val4, val5, val6, val7, val8); } operator __m512d() const { return values; } template +<<<<<<< HEAD static Vectorized blend(const Vectorized& a, const Vectorized& b) { return _mm512_mask_blend_pd(mask, a.values, b.values); } @@ -60,6 +88,40 @@ template <> class Vectorized { } static Vectorized set(const Vectorized& a, const Vectorized& b, int64_t count = size()) { +======= + static Vectorized blend( + const Vectorized& a, + const Vectorized& b) { + return _mm512_mask_blend_pd(mask, a.values, b.values); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + auto all_ones = _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF); + auto mmask = _mm512_cmp_epi64_mask( + _mm512_castpd_si512(mask.values), all_ones, _MM_CMPINT_EQ); + return _mm512_mask_blend_pd(mmask, a.values, b.values); + } + template + static Vectorized arange( + double base = 0., + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -95,14 +157,23 @@ template <> class Vectorized { _mm512_mask_storeu_pd(reinterpret_cast(ptr), mask, values); } } +<<<<<<< HEAD const double& operator[](int idx) const = delete; double& operator[](int idx) = delete; int zero_mask() const { // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit +======= + const double& operator[](int idx) const = delete; + double& operator[](int idx) = delete; + int zero_mask() const { + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated to 0-bit +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __mmask8 cmp = _mm512_cmp_pd_mask(values, _mm512_set1_pd(0.0), _CMP_EQ_OQ); return static_cast(cmp); } Vectorized isnan() const { +<<<<<<< HEAD auto cmp_mask = _mm512_cmp_pd_mask(values, _mm512_set1_pd(0.0), _CMP_UNORD_Q); return _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); @@ -110,6 +181,17 @@ template <> class Vectorized { bool has_inf_nan() const { __m512d self_sub = _mm512_sub_pd(values, values); return (_mm512_movepi8_mask(_mm512_castpd_si512(self_sub)) & 0x7777777777777777) != 0; +======= + auto cmp_mask = + _mm512_cmp_pd_mask(values, 
_mm512_set1_pd(0.0), _CMP_UNORD_Q); + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); + } + bool has_inf_nan() const { + __m512d self_sub = _mm512_sub_pd(values, values); + return (_mm512_movepi8_mask(_mm512_castpd_si512(self_sub)) & + 0x7777777777777777) != 0; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized map(double (*const f)(double)) const { __at_align__ double tmp[size()]; @@ -127,10 +209,17 @@ template <> class Vectorized { const auto zero_vec = _mm512_castsi512_pd(zero_vector); const auto nan_vec = _mm512_set1_pd(NAN); const auto not_nan_mask = _mm512_cmp_pd_mask(values, values, _CMP_EQ_OQ); +<<<<<<< HEAD const auto not_nan = _mm512_mask_set1_epi64(zero_vector, not_nan_mask, 0xFFFFFFFFFFFFFFFF); const auto nan_mask = _mm512_cmp_pd_mask(_mm512_castsi512_pd(not_nan), zero_vec, _CMP_EQ_OQ); +======= + const auto not_nan = + _mm512_mask_set1_epi64(zero_vector, not_nan_mask, 0xFFFFFFFFFFFFFFFF); + const auto nan_mask = + _mm512_cmp_pd_mask(_mm512_castsi512_pd(not_nan), zero_vec, _CMP_EQ_OQ); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const auto pi = _mm512_set1_pd(c10::pi); const auto neg_mask = _mm512_cmp_pd_mask(values, zero_vec, _CMP_LT_OQ); @@ -165,10 +254,17 @@ template <> class Vectorized { Vectorized atanh() const { return Vectorized(Sleef_atanhd8_u10(values)); } +<<<<<<< HEAD Vectorized atan2(const Vectorized &b) const { return Vectorized(Sleef_atan2d8_u10(values, b)); } Vectorized copysign(const Vectorized &sign) const { +======= + Vectorized atan2(const Vectorized& b) const { + return Vectorized(Sleef_atan2d8_u10(values, b)); + } + Vectorized copysign(const Vectorized& sign) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(Sleef_copysignd8(values, sign)); } Vectorized erf() const { @@ -195,7 +291,11 @@ template <> class Vectorized { Vectorized fmod(const Vectorized& q) const { return Vectorized(Sleef_fmodd8(values, q)); } +<<<<<<< HEAD Vectorized hypot(const Vectorized &b) const { +======= + Vectorized hypot(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(Sleef_hypotd8_u05(values, b)); } Vectorized i0() const { @@ -207,7 +307,11 @@ template <> class Vectorized { Vectorized digamma() const { return map(calc_digamma); } +<<<<<<< HEAD Vectorized igamma(const Vectorized &x) const { +======= + Vectorized igamma(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ double tmp[size()]; __at_align__ double tmp_x[size()]; store(tmp); @@ -217,7 +321,11 @@ template <> class Vectorized { } return loadu(tmp); } +<<<<<<< HEAD Vectorized igammac(const Vectorized &x) const { +======= + Vectorized igammac(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ double tmp[size()]; __at_align__ double tmp_x[size()]; store(tmp); @@ -261,11 +369,20 @@ template <> class Vectorized { Vectorized neg() const { return _mm512_xor_pd(_mm512_set1_pd(-0.), values); } +<<<<<<< HEAD Vectorized nextafter(const Vectorized &b) const { 
return Vectorized(Sleef_nextafterd8(values, b)); } Vectorized round() const { return _mm512_roundscale_pd(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +======= + Vectorized nextafter(const Vectorized& b) const { + return Vectorized(Sleef_nextafterd8(values, b)); + } + Vectorized round() const { + return _mm512_roundscale_pd( + values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized tan() const { return Vectorized(Sleef_tand8_u10(values)); @@ -274,7 +391,12 @@ template <> class Vectorized { return Vectorized(Sleef_tanhd8_u10(values)); } Vectorized trunc() const { +<<<<<<< HEAD return _mm512_roundscale_pd(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); +======= + return _mm512_roundscale_pd( + values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized lgamma() const { return Vectorized(Sleef_lgammad8_u10(values)); @@ -288,7 +410,11 @@ template <> class Vectorized { Vectorized rsqrt() const { return _mm512_div_pd(_mm512_set1_pd(1), _mm512_sqrt_pd(values)); } +<<<<<<< HEAD Vectorized pow(const Vectorized &b) const { +======= + Vectorized pow(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(Sleef_powd8_u10(values, b)); } // Comparison using the _CMP_**_OQ predicate. @@ -296,38 +422,68 @@ template <> class Vectorized { // `Q`: do not raise if an operand is NaN Vectorized operator==(const Vectorized& other) const { auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_EQ_OQ); +<<<<<<< HEAD return _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); +======= + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized operator!=(const Vectorized& other) const { auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_NEQ_UQ); +<<<<<<< HEAD return _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); +======= + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized operator<(const Vectorized& other) const { auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_LT_OQ); +<<<<<<< HEAD return _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); +======= + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized operator<=(const Vectorized& other) const { auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_LE_OQ); +<<<<<<< HEAD return _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); +======= + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) } Vectorized operator>(const Vectorized& other) const { auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_GT_OQ); +<<<<<<< HEAD return _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); +======= + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized operator>=(const Vectorized& other) const { auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_GE_OQ); +<<<<<<< HEAD return _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); +======= + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized eq(const Vectorized& other) const; @@ -339,22 +495,46 @@ template <> class Vectorized { }; template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_add_pd(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_sub_pd(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_mul_pd(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_div_pd(a, b); } @@ -366,12 +546,23 @@ inline Vectorized Vectorized::frac() const { // Implements the IEEE 754 201X `maximum` operation, which propagates NaN if // either input is a NaN. template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { auto zero_vec = _mm512_set1_epi64(0); Vectorized max = _mm512_max_pd(a, b); auto isnan_mask = _mm512_cmp_pd_mask(a, b, _CMP_UNORD_Q); auto isnan = _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vec, isnan_mask, 0xFFFFFFFFFFFFFFFF)); +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + auto zero_vec = _mm512_set1_epi64(0); + Vectorized max = _mm512_max_pd(a, b); + auto isnan_mask = _mm512_cmp_pd_mask(a, b, _CMP_UNORD_Q); + auto isnan = _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vec, isnan_mask, 0xFFFFFFFFFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Exploit the fact that all-ones is a NaN. 
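// Illustrative aside -- a scalar model, not code from this patch, of the NaN-propagating
// maximum implemented above: if either operand is unordered the lane is forced to NaN,
// which the vector code achieves by OR-ing in an all-ones lane (an all-ones bit pattern
// decodes to a quiet NaN).
#include <cmath>
#include <cstdio>
#include <limits>

static double maximum_propagating_nan(double a, double b) {
  if (std::isnan(a) || std::isnan(b))
    return std::numeric_limits<double>::quiet_NaN();
  return a > b ? a : b;
}

int main() {
  std::printf("%f\n", maximum_propagating_nan(1.0, 2.0));  // 2.000000
  std::printf("%f\n", maximum_propagating_nan(1.0, NAN));  // nan
  return 0;
}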
return _mm512_or_pd(max, isnan); } @@ -379,42 +570,85 @@ Vectorized inline maximum(const Vectorized& a, const Vectorized< // Implements the IEEE 754 201X `minimum` operation, which propagates NaN if // either input is a NaN. template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { auto zero_vec = _mm512_set1_epi64(0); Vectorized min = _mm512_min_pd(a, b); auto isnan_mask = _mm512_cmp_pd_mask(a, b, _CMP_UNORD_Q); auto isnan = _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vec, isnan_mask, 0xFFFFFFFFFFFFFFFF)); +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + auto zero_vec = _mm512_set1_epi64(0); + Vectorized min = _mm512_min_pd(a, b); + auto isnan_mask = _mm512_cmp_pd_mask(a, b, _CMP_UNORD_Q); + auto isnan = _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vec, isnan_mask, 0xFFFFFFFFFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Exploit the fact that all-ones is a NaN. return _mm512_or_pd(min, isnan); } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_pd(max, _mm512_max_pd(min, a)); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_max_pd(min, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_pd(max, a); } template <> +<<<<<<< HEAD Vectorized inline operator&(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_and_pd(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator|(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_or_pd(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator^(const Vectorized& a, const Vectorized& b) { return _mm512_xor_pd(a, b); } @@ -440,6 +674,41 @@ inline Vectorized Vectorized::lt(const Vectorized& other } inline Vectorized Vectorized::le(const Vectorized& other) const { +======= +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return _mm512_xor_pd(a, b); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::gt( + 
const Vectorized& other) const { + return (*this > other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*this <= other) & Vectorized(1.0); } @@ -449,7 +718,12 @@ inline void convert(const double* src, double* dst, int64_t n) { #ifndef __msvc_cl__ #pragma unroll #endif +<<<<<<< HEAD for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { +======= + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm512_storeu_pd(dst + i, _mm512_loadu_pd(src + i)); } #ifndef __msvc_cl__ @@ -461,15 +735,34 @@ inline void convert(const double* src, double* dst, int64_t n) { } template <> +<<<<<<< HEAD Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_fmadd_pd(a, b, c); } template <> +<<<<<<< HEAD Vectorized inline fmsub(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +Vectorized inline fmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_fmsub_pd(a, b, c); } #endif +<<<<<<< HEAD }} +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h index 43a8e5c48cbe..d9627b416d8f 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h @@ -11,17 +11,32 @@ #include #endif +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { #if defined(CPU_CAPABILITY_AVX512) +<<<<<<< HEAD template <> class Vectorized { private: static constexpr __m512i zero_vec {0, 0, 0, 0, 0, 0, 0, 0}; public: +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + static constexpr __m512i zero_vec{0, 0, 0, 0, 0, 0, 0, 0}; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 values; using value_type = float; using size_type = int; @@ -33,6 +48,7 @@ template <> class Vectorized { Vectorized(float val) { values = _mm512_set1_ps(val); } +<<<<<<< HEAD Vectorized(float val1, float val2, float val3, float val4, float val5, float val6, float val7, float val8, float val9, float val10, float val11, float val12, @@ -43,10 +59,66 @@ template <> class Vectorized { Vectorized(const float 
(&arr)[16]) : Vectorized(arr[0], arr[1], arr[2], arr[3], arr[4], arr[5], arr[6], arr[7], arr[8], arr[9], arr[10], arr[11], arr[12], arr[13], arr[14], arr[15]) {} +======= + Vectorized( + float val1, + float val2, + float val3, + float val4, + float val5, + float val6, + float val7, + float val8, + float val9, + float val10, + float val11, + float val12, + float val13, + float val14, + float val15, + float val16) { + values = _mm512_setr_ps( + val1, + val2, + val3, + val4, + val5, + val6, + val7, + val8, + val9, + val10, + val11, + val12, + val13, + val14, + val15, + val16); + } + Vectorized(const float (&arr)[16]) + : Vectorized( + arr[0], + arr[1], + arr[2], + arr[3], + arr[4], + arr[5], + arr[6], + arr[7], + arr[8], + arr[9], + arr[10], + arr[11], + arr[12], + arr[13], + arr[14], + arr[15]) {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) operator __m512() const { return values; } template +<<<<<<< HEAD static Vectorized blend(const Vectorized& a, const Vectorized& b) { return _mm512_mask_blend_ps(mask, a.values, b.values); } @@ -66,6 +138,48 @@ template <> class Vectorized { } static Vectorized set(const Vectorized& a, const Vectorized& b, int64_t count = size()) { +======= + static Vectorized blend( + const Vectorized& a, + const Vectorized& b) { + return _mm512_mask_blend_ps(mask, a.values, b.values); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + auto all_ones = _mm512_set1_epi32(0xFFFFFFFF); + auto mmask = _mm512_cmp_epi32_mask( + _mm512_castps_si512(mask.values), all_ones, _MM_CMPINT_EQ); + return _mm512_mask_blend_ps(mmask, a.values, b.values); + } + template + static Vectorized arange( + float base = 0.f, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step, + base + 8 * step, + base + 9 * step, + base + 10 * step, + base + 11 * step, + base + 12 * step, + base + 13 * step, + base + 14 * step, + base + 15 * step); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -117,14 +231,23 @@ template <> class Vectorized { _mm512_mask_storeu_ps(reinterpret_cast(ptr), mask, values); } } +<<<<<<< HEAD const float& operator[](int idx) const = delete; float& operator[](int idx) = delete; int zero_mask() const { // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit +======= + const float& operator[](int idx) const = delete; + float& operator[](int idx) = delete; + int zero_mask() const { + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated to 0-bit +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __mmask16 cmp = _mm512_cmp_ps_mask(values, _mm512_set1_ps(0.0), _CMP_EQ_OQ); return static_cast(cmp); } Vectorized isnan() const { +<<<<<<< HEAD auto mask = _mm512_cmp_ps_mask(values, _mm512_set1_ps(0.0), _CMP_UNORD_Q); return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); @@ -132,6 +255,16 @@ template <> class Vectorized { bool has_inf_nan() const { __m512 self_sub = _mm512_sub_ps(values, values); 
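// Illustrative aside -- a scalar sketch, not code from this patch, of the test behind
// has_inf_nan() above: x - x is 0 for every finite x but NaN when x is +/-Inf or NaN,
// so checking the subtraction result for NaN catches both cases; the vector version
// applies the same idea lane-wise and inspects the resulting bit patterns.
#include <cmath>
#include <cstdio>
#include <limits>

static bool scalar_has_inf_nan(float x) {
  float d = x - x;  // 0.0f for finite x, NaN for Inf or NaN inputs
  return std::isnan(d);
}

int main() {
  std::printf("%d %d %d\n",
              scalar_has_inf_nan(1.5f),                                    // 0
              scalar_has_inf_nan(std::numeric_limits<float>::infinity()),  // 1
              scalar_has_inf_nan(std::nanf("")));                          // 1
  return 0;
}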
return (_mm512_movepi8_mask(_mm512_castps_si512(self_sub)) & 0x7777777777777777) != 0; +======= + auto mask = _mm512_cmp_ps_mask(values, _mm512_set1_ps(0.0), _CMP_UNORD_Q); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); + } + bool has_inf_nan() const { + __m512 self_sub = _mm512_sub_ps(values, values); + return (_mm512_movepi8_mask(_mm512_castps_si512(self_sub)) & + 0x7777777777777777) != 0; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized map(float (*const f)(float)) const { __at_align__ float tmp[size()]; @@ -149,10 +282,17 @@ template <> class Vectorized { __m512 zero_vec = _mm512_set1_ps(0.f); const auto nan_vec = _mm512_set1_ps(NAN); const auto not_nan_mask = _mm512_cmp_ps_mask(values, values, _CMP_EQ_OQ); +<<<<<<< HEAD const auto not_nan_vec = _mm512_mask_set1_epi32(_mm512_castps_si512(zero_vec), not_nan_mask, 0xFFFFFFFF); const auto nan_mask = _mm512_cmp_ps_mask(_mm512_castsi512_ps(not_nan_vec), zero_vec, _CMP_EQ_OQ); +======= + const auto not_nan_vec = _mm512_mask_set1_epi32( + _mm512_castps_si512(zero_vec), not_nan_mask, 0xFFFFFFFF); + const auto nan_mask = _mm512_cmp_ps_mask( + _mm512_castsi512_ps(not_nan_vec), zero_vec, _CMP_EQ_OQ); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const auto pi = _mm512_set1_ps(c10::pi); const auto neg_mask = _mm512_cmp_ps_mask(values, zero_vec, _CMP_LT_OQ); @@ -187,10 +327,17 @@ template <> class Vectorized { Vectorized atanh() const { return Vectorized(Sleef_atanhf16_u10(values)); } +<<<<<<< HEAD Vectorized atan2(const Vectorized &b) const { return Vectorized(Sleef_atan2f16_u10(values, b)); } Vectorized copysign(const Vectorized &sign) const { +======= + Vectorized atan2(const Vectorized& b) const { + return Vectorized(Sleef_atan2f16_u10(values, b)); + } + Vectorized copysign(const Vectorized& sign) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(Sleef_copysignf16(values, sign)); } Vectorized erf() const { @@ -258,9 +405,18 @@ template <> class Vectorized { const __m512 vec_one = _mm512_set1_ps(1.f); const __m512 vec_zero = _mm512_set1_ps(0.f); const __m512 vec_two = _mm512_set1_ps(2.f); +<<<<<<< HEAD const __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2) const __m512 vec_ln_flt_min = _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50)); const __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); +======= + const __m512 vec_ln2f = + _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2) + const __m512 vec_ln_flt_min = + _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50)); + const __m512 vec_ln_flt_max = + _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const __m512i vec_127 = _mm512_set1_epi32(0x0000007f); const int n_mantissa_bits = 23; @@ -338,7 +494,11 @@ template <> class Vectorized { Vectorized floor() const { return _mm512_floor_ps(values); } +<<<<<<< HEAD Vectorized hypot(const Vectorized &b) const { +======= + Vectorized hypot(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(Sleef_hypotf16_u05(values, b)); } Vectorized i0() 
const { @@ -350,7 +510,11 @@ template <> class Vectorized { Vectorized digamma() const { return map(calc_digamma); } +<<<<<<< HEAD Vectorized igamma(const Vectorized &x) const { +======= + Vectorized igamma(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ float tmp[size()]; __at_align__ float tmp_x[size()]; store(tmp); @@ -360,7 +524,11 @@ template <> class Vectorized { } return loadu(tmp); } +<<<<<<< HEAD Vectorized igammac(const Vectorized &x) const { +======= + Vectorized igammac(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ float tmp[size()]; __at_align__ float tmp_x[size()]; store(tmp); @@ -373,11 +541,20 @@ template <> class Vectorized { Vectorized neg() const { return _mm512_xor_ps(_mm512_set1_ps(-0.f), values); } +<<<<<<< HEAD Vectorized nextafter(const Vectorized &b) const { return Vectorized(Sleef_nextafterf16(values, b)); } Vectorized round() const { return _mm512_roundscale_ps(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +======= + Vectorized nextafter(const Vectorized& b) const { + return Vectorized(Sleef_nextafterf16(values, b)); + } + Vectorized round() const { + return _mm512_roundscale_ps( + values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized tan() const { return Vectorized(Sleef_tanf16_u10(values)); @@ -386,7 +563,12 @@ template <> class Vectorized { return Vectorized(Sleef_tanhf16_u10(values)); } Vectorized trunc() const { +<<<<<<< HEAD return _mm512_roundscale_ps(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); +======= + return _mm512_roundscale_ps( + values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized lgamma() const { return Vectorized(Sleef_lgammaf16_u10(values)); @@ -400,7 +582,11 @@ template <> class Vectorized { Vectorized rsqrt() const { return _mm512_div_ps(_mm512_set1_ps(1), _mm512_sqrt_ps(values)); } +<<<<<<< HEAD Vectorized pow(const Vectorized &b) const { +======= + Vectorized pow(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(Sleef_powf16_u10(values, b)); } float reduce_add() const { @@ -414,38 +600,68 @@ template <> class Vectorized { // `Q`: do not raise if an operand is NaN Vectorized operator==(const Vectorized& other) const { auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_EQ_OQ); +<<<<<<< HEAD return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); +======= + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized operator!=(const Vectorized& other) const { auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_NEQ_UQ); +<<<<<<< HEAD return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); +======= + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed 
dtypes with float/bfloat16/half (#2791)) } Vectorized operator<(const Vectorized& other) const { auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_LT_OQ); +<<<<<<< HEAD return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); +======= + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized operator<=(const Vectorized& other) const { auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_LE_OQ); +<<<<<<< HEAD return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); +======= + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized operator>(const Vectorized& other) const { auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_GT_OQ); +<<<<<<< HEAD return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); +======= + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized operator>=(const Vectorized& other) const { auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_GE_OQ); +<<<<<<< HEAD return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); +======= + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized eq(const Vectorized& other) const; @@ -457,22 +673,46 @@ template <> class Vectorized { }; template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_add_ps(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_sub_ps(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_mul_ps(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_div_ps(a, b); } @@ -484,12 +724,23 @@ inline Vectorized Vectorized::frac() const { // Implements the IEEE 754 201X `maximum` operation, which propagates NaN if // either input is a NaN. 
template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { auto zero_vec = _mm512_set1_epi32(0); auto max = _mm512_max_ps(a, b); auto isnan_mask = _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q); auto isnan = _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, isnan_mask, 0xFFFFFFFF)); +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + auto zero_vec = _mm512_set1_epi32(0); + auto max = _mm512_max_ps(a, b); + auto isnan_mask = _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q); + auto isnan = _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, isnan_mask, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Exploit the fact that all-ones is a NaN. return _mm512_or_ps(max, isnan); } @@ -497,42 +748,85 @@ Vectorized inline maximum(const Vectorized& a, const Vectorized +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { auto zero_vec = _mm512_set1_epi32(0); auto min = _mm512_min_ps(a, b); auto isnan_mask = _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q); auto isnan = _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, isnan_mask, 0xFFFFFFFF)); +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + auto zero_vec = _mm512_set1_epi32(0); + auto min = _mm512_min_ps(a, b); + auto isnan_mask = _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q); + auto isnan = _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, isnan_mask, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Exploit the fact that all-ones is a NaN. return _mm512_or_ps(min, isnan); } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_ps(max, _mm512_max_ps(min, a)); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_ps(max, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_max_ps(min, a); } template <> +<<<<<<< HEAD Vectorized inline operator&(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_and_ps(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator|(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_or_ps(a, b); } template <> 
+<<<<<<< HEAD Vectorized inline operator^(const Vectorized& a, const Vectorized& b) { return _mm512_xor_ps(a, b); } @@ -558,6 +852,41 @@ inline Vectorized Vectorized::lt(const Vectorized& other) c } inline Vectorized Vectorized::le(const Vectorized& other) const { +======= +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return _mm512_xor_ps(a, b); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*this <= other) & Vectorized(1.0f); } @@ -567,7 +896,12 @@ inline void convert(const float* src, float* dst, int64_t n) { #ifndef __msvc_cl__ #pragma unroll #endif +<<<<<<< HEAD for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { +======= + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm512_storeu_ps(dst + i, _mm512_loadu_ps(src + i)); } #ifndef __msvc_cl__ @@ -579,12 +913,26 @@ inline void convert(const float* src, float* dst, int64_t n) { } template <> +<<<<<<< HEAD Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_fmadd_ps(a, b, c); } template <> +<<<<<<< HEAD Vectorized inline fmsub(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +Vectorized inline fmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_fmsub_ps(a, b, c); } @@ -594,7 +942,14 @@ Vectorized inline fmsub(const Vectorized& a, const Vectorized &input, int M=16, int N=16) { +======= +inline void transpose_block( + at::vec::VectorizedN& input, + int M = 16, + int N = 16) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(M <= 16 && N <= 16, "transpose_block expects M, N <= 16."); // unpacking and interleaving 32-bit elements __m512 temp[16]; @@ -653,7 +1008,17 @@ inline void transpose_block(at::vec::VectorizedN &input, int M=16, in // https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#L230-L304 // kernel for transposing mxn where m, n <= 16 // M + (M + 1) / 2 * 2 + (M + 3) / 4 * 4 + (M + 7) / 8 * 8 + 2 * N instructions +<<<<<<< HEAD inline void transpose_mxn_16x16(const float* src, int64_t ld_src, float* dst, int64_t ld_dst, int M, int N) { +======= +inline void transpose_mxn_16x16( 
+ const float* src, + int64_t ld_src, + float* dst, + int64_t ld_dst, + int M, + int N) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(M <= 16 && N <= 16, "transpose_mxn expects M, N <= 16."); // load from src to registers at::vec::VectorizedN input; @@ -690,8 +1055,19 @@ inline void transpose_mxn_16x16(const float* src, int64_t ld_src, float* dst, in } } +<<<<<<< HEAD template<> inline void transpose_mxn(const float* src, int64_t ld_src, float* dst, int64_t ld_dst, int M, int N) { +======= +template <> +inline void transpose_mxn( + const float* src, + int64_t ld_src, + float* dst, + int64_t ld_dst, + int M, + int N) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t i = 0; for (; i < M / 16 * 16; i += 16) { int64_t j = 0; @@ -721,12 +1097,30 @@ inline void transpose_mxn(const float* src, int64_t ld_src, float* dst, i } } +<<<<<<< HEAD template , int> = 0> inline void transpose_mxn(const float* src, int64_t ld_src, float* dst, int64_t ld_dst) { +======= +template < + typename T, + int M, + int N, + typename std::enable_if_t, int> = 0> +inline void transpose_mxn( + const float* src, + int64_t ld_src, + float* dst, + int64_t ld_dst) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) transpose_mxn(src, ld_src, dst, ld_dst, M, N); } #endif +<<<<<<< HEAD }} +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float8.h b/aten/src/ATen/cpu/vec/vec512/vec512_float8.h new file mode 100644 index 000000000000..12ee4c460641 --- /dev/null +++ b/aten/src/ATen/cpu/vec/vec512/vec512_float8.h @@ -0,0 +1,661 @@ +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! 
+// See Note [Do not compile initializers with AVX] + +#include +#include +#if (defined(CPU_CAPABILITY_AVX512)) +#define SLEEF_STATIC_LIBS +#include +#endif + +namespace at::vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) + +static inline void cvtfp8e4m3_fp32(const __m128i& a, __m512& o) { + // Zero Extend + __m512i x = _mm512_cvtepu8_epi32(a); + __m512i val = _mm512_and_epi32( + _mm512_slli_epi32(x, 24), _mm512_set1_epi32(0x7FFFFFFF)); // nonsign_val + __m512i mant = + _mm512_and_si512(x, _mm512_set1_epi32(0x07)); // mantissa = x & 0x07 + __m512i exp = _mm512_and_si512( + _mm512_srli_epi32(x, 3), + _mm512_set1_epi32(0x0F)); // exp = (x >> 3) & 0x0F + __m512i sign = + _mm512_and_si512(x, _mm512_set1_epi32(0x80)); // sign = x & 0x80 + __m512i _zeros = _mm512_setzero_si512(); + + // --- Step 1: Calculate the renorm_shift + __m512i renorm_shift = _zeros; + // Denorm case (exp == 0 && mant != 0) --- + __mmask16 denormal_mask = _mm512_cmpeq_epi32_mask(exp, _zeros) & + _mm512_cmpneq_epi32_mask(mant, _zeros); + if (denormal_mask) { + // An alternative solution is as what scalar did in + // pytorch/c10/util/Float8_e4m3fn.h To count the num of leading zeros, since + // here we know the unsigned denorm value has zero sign and exp which is 5 + // leading zeros, we need to count the leading zero of mant (3bit) which may + // done through table lookup for example: const uint8_t lz_table[8] = {3, 2, + // 1, 1, 0, 0, 0, 0}; num_leading_zero = lz_table[mant] + 5; + + __m512i _ones = _mm512_set1_epi32(1); + __m512i _twos = _mm512_set1_epi32(2); + __m512i _threes = _mm512_set1_epi32(3); + + // Default leading zero number for denorm value is 1 = 5 - 4 + __m512i denorm_renorm_shift = _ones; + // For mant 001, leading zero number is 3 = 7 -4 + __mmask16 leading_Zero_mask = _mm512_cmpeq_epi32_mask(mant, _ones); + denorm_renorm_shift = + _mm512_mask_mov_epi32(denorm_renorm_shift, leading_Zero_mask, _threes); + // For mant 010 and 011, leading zero number is 2 = 6 -4 + leading_Zero_mask = _mm512_cmpeq_epi32_mask(mant, _twos); + denorm_renorm_shift = + _mm512_mask_mov_epi32(denorm_renorm_shift, leading_Zero_mask, _twos); + leading_Zero_mask = _mm512_cmpeq_epi32_mask(mant, _threes); + denorm_renorm_shift = + _mm512_mask_mov_epi32(denorm_renorm_shift, leading_Zero_mask, _twos); + + renorm_shift = + _mm512_mask_mov_epi32(renorm_shift, denormal_mask, denorm_renorm_shift); + } + + // --- Step 2: calculate norm and denorm --- + __m512i norm_shifted = + _mm512_srli_epi32(_mm512_sllv_epi32(val, renorm_shift), 4); + // exponent bias adjustment: (0x78 - renorm_shift) << 23 + __m512i exp_bias = _mm512_slli_epi32( + _mm512_sub_epi32(_mm512_set1_epi32(0x78), renorm_shift), 23); + val = _mm512_add_epi32(norm_shifted, exp_bias); + + // --- Step 3: Nan case (exp == 0xF && mant == 0x07) --- + __mmask16 nan_mask = _mm512_cmpeq_epi32_mask(exp, _mm512_set1_epi32(0xF)) & + _mm512_cmpeq_epi32_mask(mant, _mm512_set1_epi32(0x07)); + if (nan_mask) { + const __m512i nan_values = _mm512_set1_epi32(0x7FC00000); + val = _mm512_mask_mov_epi32(val, nan_mask, nan_values); + } + + // --- Step 4: Zero case (exp == 0x00 && mant == 0x00) --- + __mmask16 zero_mask = _mm512_cmpeq_epi32_mask(exp, _zeros) & + _mm512_cmpeq_epi32_mask(mant, _zeros); + if (zero_mask) { + val = _mm512_mask_mov_epi32(val, zero_mask, _zeros); + } + + // --- Step 5: OR with sign (sign bit << 24 to get to bit 31) --- + val = _mm512_or_si512(val, _mm512_slli_epi32(sign, 24)); + + o = 
_mm512_castsi512_ps(val); +} + +static inline __m128i cvtfp32_fp8e4m3(const __m512& src) { + // cvt 16x32 from fp32 to fp8 e4m3 + const __m512i sign_mask = _mm512_set1_epi32(0x80000000); + const __m512i fp8_max = _mm512_set1_epi32(UINT32_C(1087) << 20); + const __m512i denorm_thresh = _mm512_set1_epi32(UINT32_C(121) << 23); + const __m512i denorm_mask = _mm512_set1_epi32(UINT32_C(141) << 23); + const __m512i bias_part1 = _mm512_set1_epi32((uint32_t)(7 - 127) << 23); + const __m512i rounding_bias = _mm512_set1_epi32(0x7FFFF); + __m512i f_bits = _mm512_castps_si512(src); + // Extract and save sign + __m512i sign = _mm512_and_epi32(f_bits, sign_mask); + f_bits = _mm512_xor_epi32(f_bits, sign); + + // Prepare result containers + __m512i result = _mm512_setzero_si512(); + + // Step 1: Handle case of overflow + // (f_bits >= fp8_max): set result = 0x7f + __mmask16 overflow_mask = _mm512_cmpge_epu32_mask(f_bits, fp8_max); + if (overflow_mask) { + result = _mm512_mask_set1_epi32(result, overflow_mask, 0x7f); + } + + // Step 2: Handle small numbers (denormals) + // Small numbers (f_bits < denorm_thresh) + __mmask16 denorm_thresh_mask = _mm512_cmplt_epu32_mask(f_bits, denorm_thresh); + + if (denorm_thresh_mask) { + __m512 small_input = _mm512_castsi512_ps(f_bits); + __m512 small_denorm = + _mm512_add_ps(small_input, _mm512_castsi512_ps(denorm_mask)); + __m512i small_denorm_bits = _mm512_castps_si512(small_denorm); + __m512i small_result = _mm512_sub_epi32(small_denorm_bits, denorm_mask); + result = _mm512_mask_mov_epi32(result, denorm_thresh_mask, small_result); + } + + // Step 3: Handle normal numbers + __mmask16 normal_mask = ~(overflow_mask | denorm_thresh_mask); + + if (normal_mask) { + // mant_odd = (f_bits >> 20) & 1 + __m512i mant_odd = + _mm512_and_epi32(_mm512_srli_epi32(f_bits, 20), _mm512_set1_epi32(1)); + // f_bits += bias_part1 + rounding_bias + __m512i rounded = _mm512_add_epi32(f_bits, bias_part1); + rounded = _mm512_add_epi32(rounded, rounding_bias); + // Add mant_odd + rounded = _mm512_add_epi32(rounded, mant_odd); + // Shift right by 20 bits + __m512i normal_result = _mm512_srli_epi32(rounded, 20); + result = _mm512_mask_mov_epi32(result, normal_mask, normal_result); + } + + // Merge back the sign + __m512i sign_shifted = _mm512_srli_epi32(sign, 24); + result = _mm512_or_epi32(result, sign_shifted); + + // Now result is 16 x 32-bit integers, but we only need 8-bit for each + __m512i packed = _mm512_and_si512(result, _mm512_set1_epi32(0xFF)); + + // Narrow 32-bit integers to 8-bit + return _mm512_cvtepi32_epi8(packed); +} + +static inline float fp8e4m3_to_fp32_scalar(uint8_t val) { + __m512i v = _mm512_set1_epi8(val); + __m128i v_128 = _mm512_castsi512_si128(v); + __m512 o; + cvtfp8e4m3_fp32(v_128, o); + return _mm512_cvtss_f32(o); +} + +static inline uint8_t fp32_to_fp8e4m3_scalar(float val) { + __m512 v = _mm512_set1_ps(val); + __m128i o = cvtfp32_fp8e4m3(v); + return static_cast(_mm_cvtsi128_si32(o)); +} + +static inline void cvtfp8e5m2_fp32(const __m128i& a, __m512& o) { + __m256i a_256 = _mm256_castsi128_si256(a); + __m512i a_512 = _mm512_cvtepu8_epi16(a_256); + a_512 = _mm512_slli_epi16(a_512, 8); + a_256 = _mm512_castsi512_si256(a_512); + cvtfp16_fp32(a_256, o); +} + +static inline __m128i cvtfp32_fp8e5m2(const __m512& src) { + constexpr uint32_t fp32_inf = UINT32_C(255) << 23; + constexpr uint32_t fp8_max = UINT32_C(143) << 23; + constexpr uint32_t denorm_mask = UINT32_C(134) << 23; + + // Cvt to bits + __m512i input_bits = _mm512_castps_si512(src); + __m512i result = 
_mm512_setzero_si512(); + + // Get the sign + __m512i sign = _mm512_and_si512(input_bits, _mm512_set1_epi32(0x80000000)); + + // Get the unsigned input + input_bits = _mm512_xor_si512(input_bits, sign); + + // Calculate the mask for inf, nan and denorm + __mmask16 greater_than_fp8_max = + _mm512_cmpge_epi32_mask(input_bits, _mm512_set1_epi32(fp8_max)); + __mmask16 greater_than_fp32_inf = + _mm512_cmpgt_epi32_mask(input_bits, _mm512_set1_epi32(fp32_inf)); + __mmask16 less_than_normal = _mm512_cmpgt_epi32_mask( + _mm512_set1_epi32((UINT32_C(113) << 23)), input_bits); + __m512i temp_bits_for_denorm = _mm512_setzero_si512(); + if (less_than_normal) { + __m512i denorm_mask_512i = _mm512_set1_epi32(denorm_mask); + temp_bits_for_denorm = _mm512_castps_si512(_mm512_add_ps( + _mm512_castsi512_ps(input_bits), + _mm512_castsi512_ps(denorm_mask_512i))); + temp_bits_for_denorm = + _mm512_sub_epi32(temp_bits_for_denorm, denorm_mask_512i); + } + + // Step 1: Norm Val + __m512i mant_odd_mask = + _mm512_and_epi32(_mm512_srli_epi32(input_bits, 21), _mm512_set1_epi32(1)); + input_bits = _mm512_add_epi32( + input_bits, _mm512_set1_epi32(((uint32_t)(15 - 127) << 23) + 0xFFFFF)); + input_bits = _mm512_add_epi32(input_bits, mant_odd_mask); + result = _mm512_srli_epi32(input_bits, 21); + + // Step 2: INF and NAN + if (greater_than_fp8_max) { + result = _mm512_mask_mov_epi32( + result, greater_than_fp8_max, _mm512_set1_epi8(0x7C)); + if (greater_than_fp32_inf) { + result = _mm512_mask_mov_epi32( + result, greater_than_fp32_inf, _mm512_set1_epi8(0x7F)); + } + } + + // Step 3: Denorm val + if (less_than_normal) { + result = + _mm512_mask_mov_epi32(result, less_than_normal, temp_bits_for_denorm); + } + + // Step 4: restore sign + result = _mm512_or_si512(result, _mm512_srli_epi32(sign, 24)); + + return _mm512_cvtepi32_epi8(result); +} + +static inline float fp8e5m2_to_fp32_scalar(uint8_t val) { + __m512i v = _mm512_set1_epi8(val); + __m128i v_128 = _mm512_castsi512_si128(v); + __m512 o; + cvtfp8e5m2_fp32(v_128, o); + return _mm512_cvtss_f32(o); +} + +static inline uint8_t fp32_to_fp8e5m2_scalar(float val) { + __m512 v = _mm512_set1_ps(val); + __m128i o = cvtfp32_fp8e5m2(v); + return static_cast(_mm_cvtsi128_si32(o)); +} + +template +class Vectorizedf8 { + static_assert( + std::integral_constant < bool, + std::is_same_v || std::is_same_v < T, + at::Float8_e5m2 >> ::value, + "Support only float8 e4m3."); + + private: + __m512i values; + template + Vectorized inline binary_compare(const VectorizedType& b, Op op) const { + __m512 a0, a1, a2, a3; + __m512 b0, b1, b2, b3; + __m512 o0, o1, o2, o3; + if constexpr (std::is_same_v) { + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(values, 0), a0); + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(b.values, 0), b0); + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(values, 1), a1); + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(b.values, 1), b1); + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(values, 2), a2); + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(b.values, 2), b2); + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(values, 3), a3); + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(b.values, 3), b3); + } else { + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(values, 0), a0); + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(b.values, 0), b0); + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(values, 1), a1); + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(b.values, 1), b1); + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(values, 2), a2); + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(b.values, 2), b2); + 
cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(values, 3), a3); + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(b.values, 3), b3); + } + + o0 = op(a0, b0); + o1 = op(a1, b1); + o2 = op(a2, b2); + o3 = op(a3, b3); + __m128i o128_0, o128_1, o128_2, o128_3; + if constexpr (std::is_same_v) { + o128_0 = cvtfp32_fp8e4m3(o0); + o128_1 = cvtfp32_fp8e4m3(o1); + o128_2 = cvtfp32_fp8e4m3(o2); + o128_3 = cvtfp32_fp8e4m3(o3); + } else { + o128_0 = cvtfp32_fp8e5m2(o0); + o128_1 = cvtfp32_fp8e5m2(o1); + o128_2 = cvtfp32_fp8e5m2(o2); + o128_3 = cvtfp32_fp8e5m2(o3); + } + + __m512i result = _mm512_setzero_si512(); + result = _mm512_inserti32x4(result, o128_0, 0); + result = _mm512_inserti32x4(result, o128_1, 1); + result = _mm512_inserti32x4(result, o128_2, 2); + result = _mm512_inserti32x4(result, o128_3, 3); + + return result; + } + + public: + using value_type = uint8_t; + using size_type = int; + static constexpr size_type size() { + return 64; + } + Vectorizedf8() {} + Vectorizedf8(__m512i v) : values(v) {} + Vectorizedf8(T val) { + value_type uw = val.x; + values = _mm512_set1_epi8(uw); + } + operator __m512i() const { + return values; + } + T& operator[](int idx) = delete; + const T& operator[](int idx) const = delete; + static Vectorized loadu(const void* ptr, int16_t count = size()) { + if (count == size()) { + return _mm512_loadu_si512(reinterpret_cast(ptr)); + } else if (count == 16) { + // Fast path if only load element number of 16 + __m128i input_128 = + _mm_loadu_si128(reinterpret_cast(ptr)); + return _mm512_castsi128_si512(input_128); + } else { + __mmask64 mask = (1ULL << count) - 1; + return _mm512_maskz_loadu_epi8(mask, ptr); + } + } + void store(void* ptr, int count = size()) const { + if (count == size()) { + _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), values); + } else if (count > 0) { + if (count == 16) { + // Fast path if only store element number of 16 + _mm_storeu_si128( + reinterpret_cast<__m128i*>(ptr), _mm512_castsi512_si128(values)); + } else { + __mmask64 mask = (1ULL << count) - 1; + _mm512_mask_storeu_epi8(ptr, mask, values); + } + } + } + + Vectorized abs() const { + return _mm512_andnot_si512(_mm512_set1_epi8(0x80), values); + } + + Vectorized inline operator==(const Vectorizedf8& other) const { + return binary_compare(other, [](__m512 x, __m512 y) { + auto zero_vec = _mm512_set1_epi32(0); + auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_EQ_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); + }); + } + + Vectorized inline operator!=(const Vectorizedf8& other) const { + return binary_compare(other, [](__m512 x, __m512 y) { + auto zero_vec = _mm512_set1_epi32(0); + auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_NEQ_UQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); + }); + } + + Vectorized inline operator>(const Vectorizedf8& other) const { + return binary_compare(other, [](__m512 x, __m512 y) { + auto zero_vec = _mm512_set1_epi32(0); + auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_GT_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); + }); + } + + Vectorized inline operator>=(const Vectorizedf8& other) const { + return binary_compare(other, [](__m512 x, __m512 y) { + auto zero_vec = _mm512_set1_epi32(0); + auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); + }); + } + + Vectorized inline operator<(const Vectorizedf8& other) const { + return binary_compare(other, [](__m512 x, __m512 y) { + auto 
zero_vec = _mm512_set1_epi32(0);
+      auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_LT_OQ);
+      return _mm512_castsi512_ps(
+          _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF));
+    });
+  }
+
+  Vectorized<T> inline operator<=(const Vectorizedf8& other) const {
+    return binary_compare(other, [](__m512 x, __m512 y) {
+      auto zero_vec = _mm512_set1_epi32(0);
+      auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_LE_OQ);
+      return _mm512_castsi512_ps(
+          _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF));
+    });
+  }
+};
+
+template <>
+class Vectorized<Float8_e4m3fn> : public Vectorizedf8<Float8_e4m3fn> {
+ public:
+  using Vectorizedf8<Float8_e4m3fn>::Vectorizedf8;
+
+  using value_type = Float8_e4m3fn;
+
+  Vectorized<Float8_e4m3fn> eq(const Vectorized<Float8_e4m3fn>& other) const;
+  Vectorized<Float8_e4m3fn> ne(const Vectorized<Float8_e4m3fn>& other) const;
+  Vectorized<Float8_e4m3fn> gt(const Vectorized<Float8_e4m3fn>& other) const;
+  Vectorized<Float8_e4m3fn> ge(const Vectorized<Float8_e4m3fn>& other) const;
+  Vectorized<Float8_e4m3fn> lt(const Vectorized<Float8_e4m3fn>& other) const;
+  Vectorized<Float8_e4m3fn> le(const Vectorized<Float8_e4m3fn>& other) const;
+};
+
+template <
+    typename T,
+    typename Op,
+    std::enable_if_t<
+        std::is_same_v<T, at::Float8_e4m3fn> ||
+            std::is_same_v<T, at::Float8_e5m2>,
+        int> = 0>
+static inline Vectorized<T> binary_fp8_op_as_fp32(
+    const Vectorized<T>& a,
+    const Vectorized<T>& b,
+    Op op) {
+  __m512 a0, a1, a2, a3;
+  __m512 b0, b1, b2, b3;
+  __m512 o0, o1, o2, o3;
+  if constexpr (std::is_same_v<T, at::Float8_e4m3fn>) {
+    cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(a, 0), a0);
+    cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(b, 0), b0);
+    cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(a, 1), a1);
+    cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(b, 1), b1);
+    cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(a, 2), a2);
+    cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(b, 2), b2);
+    cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(a, 3), a3);
+    cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(b, 3), b3);
+  } else {
+    cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(a, 0), a0);
+    cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(b, 0), b0);
+    cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(a, 1), a1);
+    cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(b, 1), b1);
+    cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(a, 2), a2);
+    cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(b, 2), b2);
+    cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(a, 3), a3);
+    cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(b, 3), b3);
+  }
+  o0 = op(a0, b0);
+  o1 = op(a1, b1);
+  o2 = op(a2, b2);
+  o3 = op(a3, b3);
+
+  __m128i o128_0, o128_1, o128_2, o128_3;
+  if constexpr (std::is_same_v<T, at::Float8_e4m3fn>) {
+    o128_0 = cvtfp32_fp8e4m3(o0);
+    o128_1 = cvtfp32_fp8e4m3(o1);
+    o128_2 = cvtfp32_fp8e4m3(o2);
+    o128_3 = cvtfp32_fp8e4m3(o3);
+  } else {
+    o128_0 = cvtfp32_fp8e5m2(o0);
+    o128_1 = cvtfp32_fp8e5m2(o1);
+    o128_2 = cvtfp32_fp8e5m2(o2);
+    o128_3 = cvtfp32_fp8e5m2(o3);
+  }
+
+  __m512i result = _mm512_setzero_si512();
+  result = _mm512_inserti32x4(result, o128_0, 0);
+  result = _mm512_inserti32x4(result, o128_1, 1);
+  result = _mm512_inserti32x4(result, o128_2, 2);
+  result = _mm512_inserti32x4(result, o128_3, 3);
+
+  return result;
+}
+
+// Refer to
+// https://github.com/pytorch/pytorch/pull/153364#discussion_r2086509353.
+// FP8 +, -, *, and / are planned to be deleted in the future; they exist here
+// only to keep the compiler happy.
+Vectorized<Float8_e4m3fn> inline operator+(
+    const Vectorized<Float8_e4m3fn>& a,
+    const Vectorized<Float8_e4m3fn>& b) {
+  return binary_fp8_op_as_fp32(a, b, [](const __m512& x, const __m512& y) {
+    return _mm512_add_ps(x, y);
+  });
+}
+
+Vectorized<Float8_e4m3fn> inline operator-(
+    const Vectorized<Float8_e4m3fn>& a,
+    const Vectorized<Float8_e4m3fn>& b) {
+  return binary_fp8_op_as_fp32(a, b, [](const __m512& x, const __m512& y) {
+    return _mm512_sub_ps(x, y);
+  });
+}
+
+Vectorized<Float8_e4m3fn> inline operator*(
+    const Vectorized<Float8_e4m3fn>& a,
+    const Vectorized<Float8_e4m3fn>& b) {
+  return binary_fp8_op_as_fp32(a, b, [](const __m512& x, const __m512& y) {
+    return _mm512_mul_ps(x, y);
+  });
+}
+
+Vectorized<Float8_e4m3fn> inline operator/(
+    const Vectorized<Float8_e4m3fn>& a,
+    const Vectorized<Float8_e4m3fn>& b) {
+  return binary_fp8_op_as_fp32(a, b, [](const __m512& x, const __m512& y) {
+    return _mm512_div_ps(x, y);
+  });
+}
+
+Vectorized<Float8_e4m3fn> inline operator&(
+    const Vectorized<Float8_e4m3fn>& a,
+    const Vectorized<Float8_e4m3fn>& b) {
+  return _mm512_and_si512(a, b);
+}
+
+inline Vectorized<Float8_e4m3fn> Vectorized<Float8_e4m3fn>::eq(
+    const Vectorized<Float8_e4m3fn>& other) const {
+  return (*this == other) & Vectorized<Float8_e4m3fn>(1.0f);
+}
+
+inline Vectorized<Float8_e4m3fn> Vectorized<Float8_e4m3fn>::ne(
+    const Vectorized<Float8_e4m3fn>& other) const {
+  return (*this != other) & Vectorized<Float8_e4m3fn>(1.0f);
+}
+
+inline Vectorized<Float8_e4m3fn> Vectorized<Float8_e4m3fn>::gt(
+    const Vectorized<Float8_e4m3fn>& other) const {
+  return (*this > other) & Vectorized<Float8_e4m3fn>(1.0f);
+}
+
+inline Vectorized<Float8_e4m3fn> Vectorized<Float8_e4m3fn>::ge(
+    const Vectorized<Float8_e4m3fn>& other) const {
+  return (*this >= other) & Vectorized<Float8_e4m3fn>(1.0f);
+}
+
+inline Vectorized<Float8_e4m3fn> Vectorized<Float8_e4m3fn>::lt(
+    const Vectorized<Float8_e4m3fn>& other) const {
+  return (*this < other) & Vectorized<Float8_e4m3fn>(1.0f);
+}
+
+inline Vectorized<Float8_e4m3fn> Vectorized<Float8_e4m3fn>::le(
+    const Vectorized<Float8_e4m3fn>& other) const {
+  return (*this <= other) & Vectorized<Float8_e4m3fn>(1.0f);
+}
+
+template <>
+class Vectorized<Float8_e5m2> : public Vectorizedf8<Float8_e5m2> {
+ public:
+  using Vectorizedf8<Float8_e5m2>::Vectorizedf8;
+
+  using value_type = Float8_e5m2;
+
+  Vectorized<Float8_e5m2> eq(const Vectorized<Float8_e5m2>& other) const;
+  Vectorized<Float8_e5m2> ne(const Vectorized<Float8_e5m2>& other) const;
+  Vectorized<Float8_e5m2> gt(const Vectorized<Float8_e5m2>& other) const;
+  Vectorized<Float8_e5m2> ge(const Vectorized<Float8_e5m2>& other) const;
+  Vectorized<Float8_e5m2> lt(const Vectorized<Float8_e5m2>& other) const;
+  Vectorized<Float8_e5m2> le(const Vectorized<Float8_e5m2>& other) const;
+};
+
+// Refer to
+// https://github.com/pytorch/pytorch/pull/153364#discussion_r2086509353.
+// FP8 +, -, *, and / are planned to be deleted in the future; they exist here
+// only to keep the compiler happy.
+Vectorized<Float8_e5m2> inline operator+(
+    const Vectorized<Float8_e5m2>& a,
+    const Vectorized<Float8_e5m2>& b) {
+  return binary_fp8_op_as_fp32(a, b, [](const __m512& x, const __m512& y) {
+    return _mm512_add_ps(x, y);
+  });
+}
+
+Vectorized<Float8_e5m2> inline operator-(
+    const Vectorized<Float8_e5m2>& a,
+    const Vectorized<Float8_e5m2>& b) {
+  return binary_fp8_op_as_fp32(a, b, [](const __m512& x, const __m512& y) {
+    return _mm512_sub_ps(x, y);
+  });
+}
+
+Vectorized<Float8_e5m2> inline operator*(
+    const Vectorized<Float8_e5m2>& a,
+    const Vectorized<Float8_e5m2>& b) {
+  return binary_fp8_op_as_fp32(a, b, [](const __m512& x, const __m512& y) {
+    return _mm512_mul_ps(x, y);
+  });
+}
+
+Vectorized<Float8_e5m2> inline operator/(
+    const Vectorized<Float8_e5m2>& a,
+    const Vectorized<Float8_e5m2>& b) {
+  return binary_fp8_op_as_fp32(a, b, [](const __m512& x, const __m512& y) {
+    return _mm512_div_ps(x, y);
+  });
+}
+
+Vectorized<Float8_e5m2> inline operator&(
+    const Vectorized<Float8_e5m2>& a,
+    const Vectorized<Float8_e5m2>& b) {
+  return _mm512_and_si512(a, b);
+}
+
+inline Vectorized<Float8_e5m2> Vectorized<Float8_e5m2>::eq(
+    const Vectorized<Float8_e5m2>& other) const {
+  return (*this == other) & Vectorized<Float8_e5m2>(1.0f);
+}
+
+inline Vectorized<Float8_e5m2> Vectorized<Float8_e5m2>::ne(
+    const Vectorized<Float8_e5m2>& other) const {
+  return (*this != other) & Vectorized<Float8_e5m2>(1.0f);
+}
+
+inline Vectorized<Float8_e5m2> Vectorized<Float8_e5m2>::gt(
+    const Vectorized<Float8_e5m2>& other) const {
+  return (*this > other) & Vectorized<Float8_e5m2>(1.0f);
+}
+
+inline Vectorized<Float8_e5m2> Vectorized<Float8_e5m2>::ge(
+    const Vectorized<Float8_e5m2>& other) const {
+  return (*this >= other) & Vectorized<Float8_e5m2>(1.0f);
+}
+
+inline Vectorized<Float8_e5m2> Vectorized<Float8_e5m2>::lt(
+    const Vectorized<Float8_e5m2>& other) const {
+  return (*this < other) & Vectorized<Float8_e5m2>(1.0f);
+}
+
+inline Vectorized<Float8_e5m2> Vectorized<Float8_e5m2>::le(
+    const Vectorized<Float8_e5m2>& other) const {
+  return (*this <= other) & Vectorized<Float8_e5m2>(1.0f);
+}
+
+#endif
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_int.h b/aten/src/ATen/cpu/vec/vec512/vec512_int.h
index aa19977e332f..12f45990cf8c
100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_int.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_int.h @@ -8,21 +8,35 @@ #include #include +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::vec { inline namespace CPU_CAPABILITY { #ifdef CPU_CAPABILITY_AVX512 struct Vectorizedi { +<<<<<<< HEAD protected: __m512i values; static constexpr __m512i zero_vector {0, 0, 0, 0, 0, 0, 0, 0}; +======= + protected: + __m512i values; + static constexpr __m512i zero_vector{0, 0, 0, 0, 0, 0, 0, 0}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline __m512i invert(const __m512i& v) { const auto ones = _mm512_set1_epi64(-1); return _mm512_xor_si512(ones, v); } +<<<<<<< HEAD public: +======= + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorizedi() {} Vectorizedi(__m512i v) : values(v) {} operator __m512i() const { @@ -32,17 +46,32 @@ struct Vectorizedi { #else +<<<<<<< HEAD struct Vectorizedi {}; // dummy definition to make Vectorizedi always defined +======= +struct Vectorizedi {}; // dummy definition to make Vectorizedi always defined +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif // CPU_CAPABILITY_AVX512 #ifdef CPU_CAPABILITY_AVX512 template <> +<<<<<<< HEAD class Vectorized : public Vectorizedi { private: static const Vectorized ones; public: +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorizedi { + private: + static const Vectorized ones; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = int64_t; using size_type = int; static constexpr size_type size() { @@ -50,6 +79,7 @@ class Vectorized : public Vectorizedi { } using Vectorizedi::Vectorizedi; Vectorized() {} +<<<<<<< HEAD Vectorized(int64_t v) { values = _mm512_set1_epi64(v); } Vectorized(int64_t val1, int64_t val2, int64_t val3, int64_t val4, int64_t val5, int64_t val6, int64_t val7, int64_t val8) { @@ -62,17 +92,63 @@ class Vectorized : public Vectorizedi { } static Vectorized blendv(const Vectorized& a, const Vectorized& b, const Vectorized& mask) { +======= + Vectorized(int64_t v) { + values = _mm512_set1_epi64(v); + } + Vectorized( + int64_t val1, + int64_t val2, + int64_t val3, + int64_t val4, + int64_t val5, + int64_t val6, + int64_t val7, + int64_t val8) { + values = _mm512_setr_epi64(val1, val2, val3, val4, val5, val6, val7, val8); + } + template + static Vectorized blend( + Vectorized a, + Vectorized b) { + return _mm512_mask_blend_epi64(mask, a.values, b.values); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto msb_one = _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF); auto mask_ = _mm512_cmp_epi64_mask(mask, msb_one, _MM_CMPINT_EQ); return _mm512_mask_blend_epi64(mask_, a.values, b.values); } template +<<<<<<< HEAD static Vectorized arange(int64_t base = 0, step_t step = static_cast(1)) { return Vectorized(base, base + step, base + 2 * step, base + 3 * step, base + 4 * step, base + 5 * step, base + 6 * step, base + 
7 * step); } static Vectorized set(Vectorized a, Vectorized b, int64_t count = size()) { +======= + static Vectorized arange( + int64_t base = 0, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step); + } + static Vectorized set( + Vectorized a, + Vectorized b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -114,11 +190,20 @@ class Vectorized : public Vectorizedi { _mm512_mask_storeu_epi64(ptr, mask, values); } } +<<<<<<< HEAD const int64_t& operator[](int idx) const = delete; int64_t& operator[](int idx) = delete; Vectorized abs() const { auto is_larger_mask = _mm512_cmpgt_epi64_mask(zero_vector, values); auto is_larger = _mm512_mask_set1_epi64(zero_vector, is_larger_mask, 0xFFFFFFFFFFFFFFFF); +======= + const int64_t& operator[](int idx) const = delete; + int64_t& operator[](int idx) = delete; + Vectorized abs() const { + auto is_larger_mask = _mm512_cmpgt_epi64_mask(zero_vector, values); + auto is_larger = + _mm512_mask_set1_epi64(zero_vector, is_larger_mask, 0xFFFFFFFFFFFFFFFF); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto inverse = _mm512_xor_si512(values, is_larger); return _mm512_sub_epi64(inverse, is_larger); } @@ -166,17 +251,29 @@ class Vectorized : public Vectorizedi { }; template <> +<<<<<<< HEAD class Vectorized : public Vectorizedi { private: static constexpr __m512i zero_vector {0, 0, 0, 0, 0, 0, 0, 0}; static const Vectorized ones; public: +======= +struct is_vec_specialized_for : std::bool_constant {}; +template <> +class Vectorized : public Vectorizedi { + private: + static constexpr __m512i zero_vector{0, 0, 0, 0, 0, 0, 0, 0}; + static const Vectorized ones; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = int32_t; static constexpr int size() { return 16; } using Vectorizedi::Vectorizedi; Vectorized() {} +<<<<<<< HEAD Vectorized(int32_t v) { values = _mm512_set1_epi32(v); } Vectorized(int32_t val1, int32_t val2, int32_t val3, int32_t val4, int32_t val5, int32_t val6, int32_t val7, int32_t val8, @@ -191,11 +288,62 @@ class Vectorized : public Vectorizedi { } static Vectorized blendv(const Vectorized& a, const Vectorized& b, const Vectorized& mask) { +======= + Vectorized(int32_t v) { + values = _mm512_set1_epi32(v); + } + Vectorized( + int32_t val1, + int32_t val2, + int32_t val3, + int32_t val4, + int32_t val5, + int32_t val6, + int32_t val7, + int32_t val8, + int32_t val9, + int32_t val10, + int32_t val11, + int32_t val12, + int32_t val13, + int32_t val14, + int32_t val15, + int32_t val16) { + values = _mm512_setr_epi32( + val1, + val2, + val3, + val4, + val5, + val6, + val7, + val8, + val9, + val10, + val11, + val12, + val13, + val14, + val15, + val16); + } + template + static Vectorized blend( + Vectorized a, + Vectorized b) { + return _mm512_mask_blend_epi32(mask, a.values, b.values); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto msb_one = _mm512_set1_epi32(0xFFFFFFFF); auto mask_ = 
_mm512_cmp_epi32_mask(mask, msb_one, _MM_CMPINT_EQ); return _mm512_mask_blend_epi32(mask_, a.values, b.values); } template +<<<<<<< HEAD static Vectorized arange(int32_t base = 0, step_t step = static_cast(1)) { return Vectorized( base, base + step, base + 2 * step, base + 3 * step, @@ -205,6 +353,33 @@ class Vectorized : public Vectorizedi { } static Vectorized set(Vectorized a, Vectorized b, int32_t count = size()) { +======= + static Vectorized arange( + int32_t base = 0, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step, + base + 8 * step, + base + 9 * step, + base + 10 * step, + base + 11 * step, + base + 12 * step, + base + 13 * step, + base + 14 * step, + base + 15 * step); + } + static Vectorized set( + Vectorized a, + Vectorized b, + int32_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -262,8 +437,13 @@ class Vectorized : public Vectorizedi { _mm512_mask_storeu_epi32(ptr, mask, values); } } +<<<<<<< HEAD const int32_t& operator[](int idx) const = delete; int32_t& operator[](int idx) = delete; +======= + const int32_t& operator[](int idx) const = delete; + int32_t& operator[](int idx) = delete; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized abs() const { return _mm512_abs_epi32(values); } @@ -316,6 +496,7 @@ class Vectorized : public Vectorizedi { }; template <> +<<<<<<< HEAD inline void convert(const int32_t *src, float *dst, int64_t n) { int64_t i; // int32_t and float have same size @@ -324,11 +505,27 @@ inline void convert(const int32_t *src, float *dst, int64_t n) { #endif for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { auto input_vec = _mm512_loadu_si512(reinterpret_cast(src + i)); +======= +inline void convert(const int32_t* src, float* dst, int64_t n) { + int64_t i; + // int32_t and float have same size +#ifndef _MSC_VER +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + auto input_vec = + _mm512_loadu_si512(reinterpret_cast(src + i)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto output_vec = _mm512_cvtepi32_ps(input_vec); _mm512_storeu_ps(reinterpret_cast(dst + i), output_vec); } #ifndef _MSC_VER +<<<<<<< HEAD # pragma unroll +======= +#pragma unroll +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif for (; i < n; i++) { dst[i] = static_cast(src[i]); @@ -336,6 +533,7 @@ inline void convert(const int32_t *src, float *dst, int64_t n) { } template <> +<<<<<<< HEAD inline void convert(const int32_t *src, double *dst, int64_t n) { int64_t i; // int32_t has half the size of double @@ -344,11 +542,27 @@ inline void convert(const int32_t *src, double *dst, int64_t n) { #endif for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { auto input_256_vec = _mm256_loadu_si256(reinterpret_cast(src + i)); +======= +inline void convert(const int32_t* src, double* dst, int64_t n) { + int64_t i; + // int32_t has half the size of double +#ifndef _MSC_VER +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + auto input_256_vec = + 
_mm256_loadu_si256(reinterpret_cast(src + i)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto output_vec = _mm512_cvtepi32_pd(input_256_vec); _mm512_storeu_pd(reinterpret_cast(dst + i), output_vec); } #ifndef _MSC_VER +<<<<<<< HEAD # pragma unroll +======= +#pragma unroll +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif for (; i < n; i++) { dst[i] = static_cast(src[i]); @@ -356,17 +570,30 @@ inline void convert(const int32_t *src, double *dst, int64_t n) { } template <> +<<<<<<< HEAD class Vectorized : public Vectorizedi { private: static const Vectorized ones; static constexpr __m512i zero_vector {0, 0, 0, 0, 0, 0, 0, 0}; public: +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorizedi { + private: + static const Vectorized ones; + static constexpr __m512i zero_vector{0, 0, 0, 0, 0, 0, 0, 0}; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = int16_t; static constexpr int size() { return 32; } using Vectorizedi::Vectorizedi; Vectorized() {} +<<<<<<< HEAD Vectorized(int16_t v) { values = _mm512_set1_epi16(v); } Vectorized(int16_t val1, int16_t val2, int16_t val3, int16_t val4, int16_t val5, int16_t val6, int16_t val7, int16_t val8, @@ -387,11 +614,94 @@ class Vectorized : public Vectorizedi { } static Vectorized blendv(const Vectorized& a, const Vectorized& b, const Vectorized& mask) { +======= + Vectorized(int16_t v) { + values = _mm512_set1_epi16(v); + } + Vectorized( + int16_t val1, + int16_t val2, + int16_t val3, + int16_t val4, + int16_t val5, + int16_t val6, + int16_t val7, + int16_t val8, + int16_t val9, + int16_t val10, + int16_t val11, + int16_t val12, + int16_t val13, + int16_t val14, + int16_t val15, + int16_t val16, + int16_t val17, + int16_t val18, + int16_t val19, + int16_t val20, + int16_t val21, + int16_t val22, + int16_t val23, + int16_t val24, + int16_t val25, + int16_t val26, + int16_t val27, + int16_t val28, + int16_t val29, + int16_t val30, + int16_t val31, + int16_t val32) { + values = _mm512_set_epi16( + val32, + val31, + val30, + val29, + val28, + val27, + val26, + val25, + val24, + val23, + val22, + val21, + val20, + val19, + val18, + val17, + val16, + val15, + val14, + val13, + val12, + val11, + val10, + val9, + val8, + val7, + val6, + val5, + val4, + val3, + val2, + val1); + } + template + static Vectorized blend( + Vectorized a, + Vectorized b) { + return _mm512_mask_blend_epi16(mask, a.values, b.values); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto msb_one = _mm512_set1_epi16(0xFFFF); auto mask_ = _mm512_cmp_epi16_mask(mask, msb_one, _MM_CMPINT_EQ); return _mm512_mask_blend_epi16(mask_, a.values, b.values); } template +<<<<<<< HEAD static Vectorized arange(int16_t base = 0, step_t step = static_cast(1)) { return Vectorized( base, base + step, base + 2 * step, base + 3 * step, @@ -406,6 +716,49 @@ class Vectorized : public Vectorizedi { } static Vectorized set(Vectorized a, Vectorized b, int16_t count = size()) { +======= + static Vectorized arange( + int16_t base = 0, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + 
step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step, + base + 8 * step, + base + 9 * step, + base + 10 * step, + base + 11 * step, + base + 12 * step, + base + 13 * step, + base + 14 * step, + base + 15 * step, + base + 16 * step, + base + 17 * step, + base + 18 * step, + base + 19 * step, + base + 20 * step, + base + 21 * step, + base + 22 * step, + base + 23 * step, + base + 24 * step, + base + 25 * step, + base + 26 * step, + base + 27 * step, + base + 28 * step, + base + 29 * step, + base + 30 * step, + base + 31 * step); + } + static Vectorized set( + Vectorized a, + Vectorized b, + int16_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -495,8 +848,13 @@ class Vectorized : public Vectorizedi { _mm512_mask_storeu_epi16(ptr, mask, values); } } +<<<<<<< HEAD const int16_t& operator[](int idx) const = delete; int16_t& operator[](int idx) = delete; +======= + const int16_t& operator[](int idx) const = delete; + int16_t& operator[](int idx) = delete; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized abs() const { return _mm512_abs_epi16(values); } @@ -546,18 +904,30 @@ class Vectorized : public Vectorizedi { template class Vectorized8 : public Vectorizedi { static_assert( +<<<<<<< HEAD std::is_same_v || std::is_same_v, "Only int8_t/uint8_t are supported"); protected: static constexpr __m512i zero_vector {0, 0, 0, 0, 0, 0, 0, 0}; static const Vectorized ones; public: +======= + std::is_same_v || std::is_same_v, + "Only int8_t/uint8_t are supported"); + + protected: + static constexpr __m512i zero_vector{0, 0, 0, 0, 0, 0, 0, 0}; + static const Vectorized ones; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = T; static constexpr int size() { return 64; } using Vectorizedi::Vectorizedi; Vectorized8() {} +<<<<<<< HEAD Vectorized8(T v) { values = _mm512_set1_epi8(v); } Vectorized8(T val1, T val2, T val3, T val4, T val5, T val6, T val7, T val8, @@ -583,12 +953,148 @@ class Vectorized8 : public Vectorizedi { val24, val23, val22, val21, val20, val19, val18, val17, val16, val15, val14, val13, val12, val11, val10, val9, val8, val7, val6, val5, val4, val3, val2, val1); +======= + Vectorized8(T v) { + values = _mm512_set1_epi8(v); + } + Vectorized8( + T val1, + T val2, + T val3, + T val4, + T val5, + T val6, + T val7, + T val8, + T val9, + T val10, + T val11, + T val12, + T val13, + T val14, + T val15, + T val16, + T val17, + T val18, + T val19, + T val20, + T val21, + T val22, + T val23, + T val24, + T val25, + T val26, + T val27, + T val28, + T val29, + T val30, + T val31, + T val32, + T val33, + T val34, + T val35, + T val36, + T val37, + T val38, + T val39, + T val40, + T val41, + T val42, + T val43, + T val44, + T val45, + T val46, + T val47, + T val48, + T val49, + T val50, + T val51, + T val52, + T val53, + T val54, + T val55, + T val56, + T val57, + T val58, + T val59, + T val60, + T val61, + T val62, + T val63, + T val64) { + values = _mm512_set_epi8( + val64, + val63, + val62, + val61, + val60, + val59, + val58, + val57, + val56, + val55, + val54, + val53, + val52, + val51, + val50, + val49, + val48, + val47, + val46, + val45, + val44, + val43, + val42, + val41, + val40, + val39, + val38, + val37, + 
val36, + val35, + val34, + val33, + val32, + val31, + val30, + val29, + val28, + val27, + val26, + val25, + val24, + val23, + val22, + val21, + val20, + val19, + val18, + val17, + val16, + val15, + val14, + val13, + val12, + val11, + val10, + val9, + val8, + val7, + val6, + val5, + val4, + val3, + val2, + val1); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template static Vectorized blend(Vectorized a, Vectorized b) { return _mm512_mask_blend_epi8(mask, a.values, b.values); } template +<<<<<<< HEAD static Vectorized arange(T base = 0, step_t step = static_cast(1)) { return Vectorized( base, base + step, base + 2 * step, base + 3 * step, @@ -610,6 +1116,78 @@ class Vectorized8 : public Vectorizedi { } static Vectorized set(Vectorized a, Vectorized b, T count = size()) { +======= + static Vectorized arange( + T base = 0, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step, + base + 8 * step, + base + 9 * step, + base + 10 * step, + base + 11 * step, + base + 12 * step, + base + 13 * step, + base + 14 * step, + base + 15 * step, + base + 16 * step, + base + 17 * step, + base + 18 * step, + base + 19 * step, + base + 20 * step, + base + 21 * step, + base + 22 * step, + base + 23 * step, + base + 24 * step, + base + 25 * step, + base + 26 * step, + base + 27 * step, + base + 28 * step, + base + 29 * step, + base + 30 * step, + base + 31 * step, + base + 32 * step, + base + 33 * step, + base + 34 * step, + base + 35 * step, + base + 36 * step, + base + 37 * step, + base + 38 * step, + base + 39 * step, + base + 40 * step, + base + 41 * step, + base + 42 * step, + base + 43 * step, + base + 44 * step, + base + 45 * step, + base + 46 * step, + base + 47 * step, + base + 48 * step, + base + 49 * step, + base + 50 * step, + base + 51 * step, + base + 52 * step, + base + 53 * step, + base + 54 * step, + base + 55 * step, + base + 56 * step, + base + 57 * step, + base + 58 * step, + base + 59 * step, + base + 60 * step, + base + 61 * step, + base + 62 * step, + base + 63 * step); + } + static Vectorized set(Vectorized a, Vectorized b, T count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -746,6 +1324,7 @@ class Vectorized8 : public Vectorizedi { return _mm512_loadu_si512(reinterpret_cast(ptr)); } static Vectorized loadu_one_fourth(const void* ptr) { +<<<<<<< HEAD // Fast path if only load element number of 16. // Note: We didn't merge it as fast path of loadu(const void* ptr, T count), // Because loadu(const void* ptr, T count) requires zero initialization for upper 384 bits. @@ -754,6 +1333,17 @@ class Vectorized8 : public Vectorizedi { // since gcc 9.3 doesn't support it now. __m128i input_128 = _mm_loadu_si128(reinterpret_cast(ptr)); return _mm512_castsi128_si512(input_128); +======= + // Fast path if only load element number of 16. + // Note: We didn't merge it as fast path of loadu(const void* ptr, T count), + // Because loadu(const void* ptr, T count) requires zero initialization for + // upper 384 bits. However, by using _mm512_castsi128_si512, the upper 384 + // bits of the result are undefined. + // TODO We can use _mm512_zextsi128_si512 in the furture, + // since gcc 9.3 doesn't support it now. 
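      // Once the minimum supported toolchains provide it, _mm512_zextsi128_si512
      // gives the zero-extended 128->512 cast directly. A masked byte load is
      // another way to get guaranteed-zero upper lanes (a sketch, assuming
      // AVX-512BW; not the fast path used below):
      //   __mmask64 lo16 = (1ULL << 16) - 1;                // keep bytes 0..15
      //   return _mm512_maskz_loadu_epi8(lo16, ptr);        // upper 48 bytes are zero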
+ __m128i input_128 = _mm_loadu_si128(reinterpret_cast(ptr)); + return _mm512_castsi128_si512(input_128); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } static Vectorized loadu(const void* ptr, T count) { if (count == size()) { @@ -775,16 +1365,25 @@ class Vectorized8 : public Vectorizedi { if (count == 16) { // Fast path if only store element number of 16 _mm_storeu_si128( +<<<<<<< HEAD reinterpret_cast<__m128i*>(ptr), _mm512_castsi512_si128(values)); +======= + reinterpret_cast<__m128i*>(ptr), _mm512_castsi512_si128(values)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { __mmask64 mask = (1ULL << count) - 1; _mm512_mask_storeu_epi8(ptr, mask, values); } } } +<<<<<<< HEAD const T& operator[](int idx) const = delete; T& operator[](int idx) = delete; +======= + const T& operator[](int idx) const = delete; + T& operator[](int idx) = delete; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized real() const { return *this; } @@ -796,6 +1395,7 @@ class Vectorized8 : public Vectorizedi { } }; +<<<<<<< HEAD template<> class Vectorized: public Vectorized8 { public: @@ -803,6 +1403,20 @@ class Vectorized: public Vectorized8 { static Vectorized blendv(const Vectorized& a, const Vectorized& b, const Vectorized& mask) { +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorized8 { + public: + using Vectorized8::Vectorized8; + + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto msb_one = _mm512_set1_epi8(0xFF); auto mask_ = _mm512_cmp_epi8_mask(mask, msb_one, _MM_CMPINT_EQ); return _mm512_mask_blend_epi8(mask_, a.values, b.values); @@ -845,6 +1459,7 @@ class Vectorized: public Vectorized8 { Vectorized le(const Vectorized& other) const; }; +<<<<<<< HEAD template<> class Vectorized: public Vectorized8 { public: @@ -852,6 +1467,20 @@ class Vectorized: public Vectorized8 { static Vectorized blendv(const Vectorized& a, const Vectorized& b, const Vectorized& mask) { +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorized8 { + public: + using Vectorized8::Vectorized8; + + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto msb_one = _mm512_set1_epi8(0xFF); auto mask_ = _mm512_cmp_epu8_mask(mask, msb_one, _MM_CMPINT_EQ); return _mm512_mask_blend_epi8(mask_, a.values, b.values); @@ -895,52 +1524,112 @@ class Vectorized: public Vectorized8 { }; template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_add_epi64(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& 
b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_add_epi32(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_add_epi16(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_add_epi8(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_add_epi8(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_sub_epi64(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_sub_epi32(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_sub_epi16(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_sub_epi8(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_sub_epi8(a, b); } @@ -966,22 +1655,47 @@ inline Vectorized Vectorized::neg() const { } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_mullo_epi64(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_mullo_epi32(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_mullo_epi16(a, b); } template +<<<<<<< HEAD Vectorized inline int_elementwise_binary_512(const Vectorized& a, const Vectorized& b, Op op) { +======= +Vectorized inline int_elementwise_binary_512( + const Vectorized& a, + const Vectorized& b, + Op op) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) T values_a[Vectorized::size()]; T values_b[Vectorized::size()]; a.store(values_a); @@ -993,7 +1707,13 @@ Vectorized inline int_elementwise_binary_512(const Vectorized& a, const Ve } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // We don't have an instruction for multiplying int8_t #ifndef CPU_CAPABILITY_AVX512 return int_elementwise_binary_512(a, b, std::multiplies()); @@ -1011,14 +1731,25 @@ Vectorized inline operator*(const Vectorized& a, const Vectorize } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // We don't have an instruction for multiplying uint8_t #ifndef CPU_CAPABILITY_AVX512 return int_elementwise_binary_512(a, b, std::multiplies()); #else __m512i mask00FF = _mm512_set1_epi16(0x00FF); +<<<<<<< HEAD __m512i a_lo = _mm512_and_si512 (a, mask00FF); __m512i b_lo = _mm512_and_si512 (b, mask00FF); +======= + __m512i a_lo = _mm512_and_si512(a, mask00FF); + __m512i b_lo = _mm512_and_si512(b, mask00FF); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512i a_hi = _mm512_srli_epi16(a, 8); __m512i b_hi = _mm512_srli_epi16(b, 8); __m512i res_lo = _mm512_and_si512(_mm512_mullo_epi16(a_lo, b_lo), mask00FF); @@ -1029,126 +1760,276 @@ Vectorized inline operator*(const Vectorized& a, const Vectori } template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epi64(a, b); } template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epi32(a, b); } template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 
([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epi16(a, b); } template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epi8(a, b); } template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epu8(a, b); } template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_max_epi64(a, b); } template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_max_epi32(a, b); } template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_max_epi16(a, b); } template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_max_epi8(a, b); } template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_max_epu8(a, b); } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_val, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epi64(max_val, _mm512_max_epi64(a, min_val)); } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_val, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epi32(max_val, _mm512_max_epi32(a, min_val)); } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) { +======= +Vectorized inline 
clamp( + const Vectorized& a, + const Vectorized& min_val, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epi16(max_val, _mm512_max_epi16(a, min_val)); } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_val, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epi8(max_val, _mm512_max_epi8(a, min_val)); } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_val, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epu8(max_val, _mm512_max_epu8(a, min_val)); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epi64(max_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epi32(max_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epi16(max_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epi8(max_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epu8(max_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_max_epi64(min_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_val) { +>>>>>>> 5729657180 
([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_max_epi32(min_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_max_epi16(min_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_max_epi8(min_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) { return _mm512_max_epu8(min_val, a); } @@ -1164,17 +2045,52 @@ std::enable_if_t, Vectorized> inline convert_to_int32(const int8_t* ptr, int count=Vectorized::size()) { if (count == Vectorized::size()) { return _mm512_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(ptr))); +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_val) { + return _mm512_max_epu8(min_val, a); +} + +template +std::enable_if_t< + !(std::is_same_v || std::is_same_v), + Vectorized< + int32_t>> inline convert_to_int32(const T* ptr, int count = Vectorized::size()) { + return Vectorized::loadu(ptr, count); +} + +template +std:: + enable_if_t, Vectorized> inline convert_to_int32( + const int8_t* ptr, + int count = Vectorized::size()) { + if (count == Vectorized::size()) { + return _mm512_cvtepi8_epi32( + _mm_loadu_si128(reinterpret_cast(ptr))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { auto a = Vectorized::loadu(ptr, count); return _mm512_cvtepi8_epi32(_mm512_castsi512_si128(a)); } } +<<<<<<< HEAD template std::enable_if_t, Vectorized> inline convert_to_int32(const uint8_t* ptr, int count=Vectorized::size()) { if (count == Vectorized::size()) { return _mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast(ptr))); +======= +template +std:: + enable_if_t, Vectorized> inline convert_to_int32( + const uint8_t* ptr, + int count = Vectorized::size()) { + if (count == Vectorized::size()) { + return _mm512_cvtepu8_epi32( + _mm_loadu_si128(reinterpret_cast(ptr))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { auto a = Vectorized::loadu(ptr, count); return _mm512_cvtepu8_epi32(_mm512_castsi512_si128(a)); @@ -1182,6 +2098,7 @@ inline convert_to_int32(const uint8_t* ptr, int count=Vectorized::size( } template <> +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { return int_elementwise_binary_512(a, b, std::divides()); } @@ -1215,10 +2132,72 @@ inline Vectorized operator^(const Vectorized& a, const Vectorized& b) { return _mm512_xor_si512(a, b); } template>::value, int> = 0> +======= +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return int_elementwise_binary_512(a, b, std::divides()); +} +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return int_elementwise_binary_512(a, b, std::divides()); +} +template <> +Vectorized inline operator/( + const Vectorized& 
a, + const Vectorized& b) { + return int_elementwise_binary_512(a, b, std::divides()); +} +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return int_elementwise_binary_512(a, b, std::divides()); +} +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return int_elementwise_binary_512(a, b, std::divides()); +} + +template < + class T, + typename std::enable_if_t< + std::is_base_of>::value, + int> = 0> +inline Vectorized operator&(const Vectorized& a, const Vectorized& b) { + return _mm512_and_si512(a, b); +} +template < + class T, + typename std::enable_if_t< + std::is_base_of>::value, + int> = 0> +inline Vectorized operator|(const Vectorized& a, const Vectorized& b) { + return _mm512_or_si512(a, b); +} +template < + class T, + typename std::enable_if_t< + std::is_base_of>::value, + int> = 0> +inline Vectorized operator^(const Vectorized& a, const Vectorized& b) { + return _mm512_xor_si512(a, b); +} +template < + class T, + typename std::enable_if_t< + std::is_base_of>::value, + int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized operator~(const Vectorized& a) { return _mm512_xor_si512(a, _mm512_set1_epi32(-1)); } +<<<<<<< HEAD inline Vectorized Vectorized::eq(const Vectorized& other) const { return (*this == other) & Vectorized(1); } @@ -1341,6 +2320,167 @@ inline Vectorized Vectorized::le(const Vectorized& ot template || std::is_same_v, int> = 0> Vectorized inline shift_512_8(const Vectorized& a, const Vectorized& b) { +======= +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized 
Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +template < + bool left_shift, + typename T, + typename std::enable_if_t< + std::is_same_v || std::is_same_v, + int> = 0> +Vectorized inline shift_512_8( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // No vector instruction for shifting int8_t/uint8_t, so emulating // it instead. @@ -1350,6 +2490,7 @@ Vectorized inline shift_512_8(const Vectorized& a, const Vectorized& b) // M!=N) is set so that shuffle will move element with index M from // input pair into element with index N in output pair, and element // with index M in output pair will be set to all 0s. 
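  // The eq()/ne()/gt()/ge()/lt()/le() overloads above all follow the same
  // pattern: the full-width comparison produces lanes of all-ones, and AND-ing
  // with Vectorized<T>(1) turns that into the 0/1 "boolean" lanes callers
  // expect. With raw intrinsics the int16_t case looks roughly like this
  // (a sketch, assuming AVX-512BW):
  //   __mmask32 k  = _mm512_cmpeq_epi16_mask(a, b);                 // 1 bit per equal lane
  //   __m512i ones = _mm512_maskz_mov_epi16(k, _mm512_set1_epi16(-1)); // 0xFFFF where equal
  //   __m512i out  = _mm512_and_si512(ones, _mm512_set1_epi16(1));     // 0 or 1 per lane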
+<<<<<<< HEAD __m512i ctl_0_1 = _mm512_set_epi8(62, 0x80, 60, 0x80, 58, 0x80, 56, 0x80, 54, 0x80, 52, 0x80, 50, 0x80, 48, 0x80, 46, 0x80, 44, 0x80, 42, 0x80, 40, 0x80, @@ -1366,6 +2507,138 @@ Vectorized inline shift_512_8(const Vectorized& a, const Vectorized& b) 0x80, 23, 0x80, 21, 0x80, 19, 0x80, 17, 0x80, 15, 0x80, 13, 0x80, 11, 0x80, 9, 0x80, 7, 0x80, 5, 0x80, 3, 0x80, 1); +======= + __m512i ctl_0_1 = _mm512_set_epi8( + 62, + 0x80, + 60, + 0x80, + 58, + 0x80, + 56, + 0x80, + 54, + 0x80, + 52, + 0x80, + 50, + 0x80, + 48, + 0x80, + 46, + 0x80, + 44, + 0x80, + 42, + 0x80, + 40, + 0x80, + 38, + 0x80, + 36, + 0x80, + 34, + 0x80, + 32, + 0x80, + 30, + 0x80, + 28, + 0x80, + 26, + 0x80, + 24, + 0x80, + 22, + 0x80, + 20, + 0x80, + 18, + 0x80, + 16, + 0x80, + 14, + 0x80, + 12, + 0x80, + 10, + 0x80, + 8, + 0x80, + 6, + 0x80, + 4, + 0x80, + 2, + 0x80, + 0, + 0x80); + __m512i ctl_1_0 = _mm512_set_epi8( + 0x80, + 63, + 0x80, + 61, + 0x80, + 59, + 0x80, + 57, + 0x80, + 55, + 0x80, + 53, + 0x80, + 51, + 0x80, + 49, + 0x80, + 47, + 0x80, + 45, + 0x80, + 43, + 0x80, + 41, + 0x80, + 39, + 0x80, + 37, + 0x80, + 35, + 0x80, + 33, + 0x80, + 31, + 0x80, + 29, + 0x80, + 27, + 0x80, + 25, + 0x80, + 23, + 0x80, + 21, + 0x80, + 19, + 0x80, + 17, + 0x80, + 15, + 0x80, + 13, + 0x80, + 11, + 0x80, + 9, + 0x80, + 7, + 0x80, + 5, + 0x80, + 3, + 0x80, + 1); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Masks for bitwise and operation, treating 512 bits as an array of // 8-bit elements, and considering them in pairs of neighboring @@ -1396,11 +2669,18 @@ Vectorized inline shift_512_8(const Vectorized& a, const Vectorized& b) __m512i c0; if (left_shift) c0 = _mm512_sllv_epi16(a0, b0); +<<<<<<< HEAD else if constexpr (std::is_same_v) c0 = _mm512_srav_epi16(a0, b0); else c0 = _mm512_srlv_epi16(a0, b0); +======= + else if constexpr (std::is_same_v) + c0 = _mm512_srav_epi16(a0, b0); + else + c0 = _mm512_srlv_epi16(a0, b0); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c0 = _mm512_shuffle_epi8(c0, ctl_1_0); // Peform shifting the same way for input array elements with @@ -1410,11 +2690,18 @@ Vectorized inline shift_512_8(const Vectorized& a, const Vectorized& b) __m512i c1; if (left_shift) c1 = _mm512_sllv_epi16(a1, b1); +<<<<<<< HEAD else if constexpr (std::is_same_v) c1 = _mm512_srav_epi16(a1, b1); else c1 = _mm512_srlv_epi16(a1, b1); +======= + else if constexpr (std::is_same_v) + c1 = _mm512_srav_epi16(a1, b1); + else + c1 = _mm512_srlv_epi16(a1, b1); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c1 = _mm512_and_si512(c1, keep_1); // Merge partial results into the final result. 
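// The 8-bit shift emulation above builds on the 16-bit variable-shift
// instructions that do exist; a minimal self-contained sketch of that building
// block (assumes AVX-512BW and <immintrin.h>; illustrative, not part of this
// header):
static inline __m512i sllv_epi16_demo() {
  __m512i ones = _mm512_set1_epi16(1);
  __m512i cnts = _mm512_set_epi16(
      0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
      0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7);
  // Each 16-bit lane of `ones` is shifted left by the corresponding lane of
  // `cnts`, yielding the values 1, 2, 4, ..., 128 across the lanes.
  return _mm512_sllv_epi16(ones, cnts);
}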
@@ -1424,55 +2711,120 @@ Vectorized inline shift_512_8(const Vectorized& a, const Vectorized& b) } template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_sllv_epi64(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_sllv_epi32(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_sllv_epi16(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return shift_512_8(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return shift_512_8(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_srav_epi64(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_srav_epi32(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_srav_epi16(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return shift_512_8(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return shift_512_8(a, b); } 
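// Note the asymmetry encoded in the operator>> overloads above: the signed
// element types lower to the arithmetic shifts (_mm512_srav_*), while uint8_t
// takes the logical path (_mm512_srlv_epi16 inside shift_512_8). A small
// illustration of the difference on 32-bit lanes (a sketch, assuming AVX-512F
// and <immintrin.h>):
static inline void shift_right_sign_demo() {
  __m512i v = _mm512_set1_epi32(-16);
  __m512i s = _mm512_set1_epi32(2);
  __m512i arith = _mm512_srav_epi32(v, s);    // every lane: -4 (sign preserved)
  __m512i logical = _mm512_srlv_epi32(v, s);  // every lane: 0x3FFFFFFC
  (void)arith;
  (void)logical;
}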
#endif +<<<<<<< HEAD }} +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_mask.h b/aten/src/ATen/cpu/vec/vec512/vec512_mask.h index d32e1da1cf72..a40cdcd3b8eb 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_mask.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_mask.h @@ -85,8 +85,12 @@ struct VecMaskLoad< mask_t, dst_n, std::enable_if_t< +<<<<<<< HEAD std::is_same_v || std::is_same_v>> { +======= + std::is_same_v || std::is_same_v>> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline VectorizedN apply( const data_t* ptr, const VecMask& vec_mask) { @@ -152,8 +156,12 @@ struct VecMaskLoad< mask_t, 1, std::enable_if_t< +<<<<<<< HEAD std::is_same_v || std::is_same_v>> { +======= + std::is_same_v || std::is_same_v>> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline VectorizedN apply( const data_t* ptr, const VecMask& vec_mask) { @@ -174,8 +182,12 @@ struct VecMaskLoad< mask_t, 1, std::enable_if_t< +<<<<<<< HEAD std::is_same_v || std::is_same_v>> { +======= + std::is_same_v || std::is_same_v>> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline VectorizedN apply( const data_t* ptr, const VecMask& vec_mask) { diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h index ec14ef51601b..264145b610cd 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h @@ -35,8 +35,13 @@ // specified by float_vec_return_type. // // When writing kernels with these vectors, it is expected that floating- +<<<<<<< HEAD // point operations will be carried out in a loop over Vectorized::float_num_vecs // iterations. +======= +// point operations will be carried out in a loop over +// Vectorized::float_num_vecs iterations. 
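// A schematic of that usage pattern, written against the interface declared
// below (Vectorized<c10::qint8> exposing loadu/dequantize/quantize and
// float_num_vecs(), as in this header); a sketch only, with scale/zero-point
// setup elided:
//   using QVec = at::vec::Vectorized<c10::qint8>;
//   auto qx = QVec::loadu(src);
//   auto fx = qx.dequantize(scale_vec, zero_point_vec);   // float_num_vecs() float vectors
//   for (int i = 0; i < QVec::float_num_vecs(); ++i) {
//     fx[i] = fx[i] * fx[i];                              // any float-side computation
//   }
//   QVec::quantize(fx, scale, zero_point, inverse_scale).store(dst);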
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at { namespace vec { @@ -62,7 +67,10 @@ struct Vectorizedqi { } }; +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template __m512i pack_saturate_and_clamp( __m512i first, @@ -106,10 +114,19 @@ inline __m512i pack_saturate_and_clamp( } template +<<<<<<< HEAD typename std::enable_if_t || std::is_same_v, at::vec::Vectorized> inline convert_int8_to_float(at::vec::Vectorized src) { // Note: this function only convert inputs number of elements equal to at::vec::Vectorized.size() // Only handle first 16*8 bits +======= +typename std::enable_if_t< + std::is_same_v || std::is_same_v, + at::vec::Vectorized< + float>> inline convert_int8_to_float(at::vec::Vectorized src) { + // Note: this function only convert inputs number of elements equal to + // at::vec::Vectorized.size() Only handle first 16*8 bits +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m128i input_128 = _mm512_castsi512_si128(src); // Convert from 16*uint8/int8 to 16*int32 __m512i input_512_extended; @@ -122,8 +139,15 @@ inline convert_int8_to_float(at::vec::Vectorized src) { } template +<<<<<<< HEAD typename std::enable_if_t || std::is_same_v, at::vec::Vectorized> inline convert_float_to_int8(at::vec::Vectorized src) { +======= +typename std::enable_if_t< + std::is_same_v || std::is_same_v, + at::vec::Vectorized< + T>> inline convert_float_to_int8(at::vec::Vectorized src) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Convert from float32 to int32 with truncation __m512i x_values_int32 = _mm512_cvttps_epi32(src); @@ -134,11 +158,33 @@ inline convert_float_to_int8(at::vec::Vectorized src) { constexpr auto max_val = std::numeric_limits::max(); // Convert from int16 to uint8/int8 using unsigned saturation +<<<<<<< HEAD __m512i xyzw_clamped_v = pack_saturate_and_clamp( xy_packed_v, xy_packed_v, min_val, max_val); __m512i permute_mask_v = _mm512_set_epi32(0x0f, 0x0b, 0x07, 0x03, 0x0e, 0x0a, 0x06, 0x02, 0x0d, 0x09, 0x05, 0x01, 0x0c, 0x08, 0x04, 0x00); +======= + __m512i xyzw_clamped_v = + pack_saturate_and_clamp(xy_packed_v, xy_packed_v, min_val, max_val); + __m512i permute_mask_v = _mm512_set_epi32( + 0x0f, + 0x0b, + 0x07, + 0x03, + 0x0e, + 0x0a, + 0x06, + 0x02, + 0x0d, + 0x09, + 0x05, + 0x01, + 0x0c, + 0x08, + 0x04, + 0x00); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v); } @@ -178,12 +224,49 @@ __FORCE_INLINE void QuantizeAvx512( 0xff, 0xff, 0xff, 0xff, 0x0c, 0x08, 0x04, 0x00); // clang-format on +<<<<<<< HEAD __m512i permute_mask_v = _mm512_set_epi32(0x0f, 0x0b, 0x07, 0x03, 0x0e, 0x0a, 0x06, 0x02, 0x0d, 0x09, 0x05, 0x01, 0x0c, 0x08, 0x04, 0x00); __m512i permute_mask_l8_v = _mm512_set_epi32(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x08, 0x04, 0x00); +======= + __m512i permute_mask_v = _mm512_set_epi32( + 0x0f, + 0x0b, + 0x07, + 0x03, + 0x0e, + 0x0a, + 0x06, + 0x02, + 0x0d, + 0x09, + 0x05, + 0x01, + 0x0c, + 0x08, + 0x04, + 0x00); + __m512i permute_mask_l8_v = _mm512_set_epi32( + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, 
+ 0x00, + 0x0c, + 0x08, + 0x04, + 0x00); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int len_aligned = len / (VLEN * 4) * (VLEN * 4); for (; i < len_aligned; i += 4 * VLEN) { // x @@ -226,8 +309,12 @@ __FORCE_INLINE void QuantizeAvx512( __m512i xyzw_clamped_v = pack_saturate_and_clamp(xy_packed_v, zw_packed_v, min_val, max_val); +<<<<<<< HEAD xyzw_clamped_v = _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v); +======= + xyzw_clamped_v = _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + i), xyzw_clamped_v); } @@ -269,6 +356,7 @@ __FORCE_INLINE void QuantizeAvx512( } } +<<<<<<< HEAD template<> struct Vectorized : public Vectorizedqi { using size_type = int; @@ -395,6 +483,145 @@ struct Vectorized : public Vectorizedqi { template <> Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public Vectorizedqi { + using size_type = int; + static constexpr size_type size() { + return 16; + } + + static constexpr int float_num_vecs() { + return 1; + } + + static constexpr int int_num_vecs() { + return 1; + } + + using float_vec_return_type = std::array, 1>; + using int_vec_return_type = std::array, 1>; + using value_type = c10::qint32::underlying; + + public: + using Vectorizedqi::Vectorizedqi; + Vectorized() {} + + Vectorized(__m512i vals_) { + vals = vals_; + } + + // Broadcast constructor + Vectorized(const c10::qint32& val) { + value_type uw = val.val_; + vals = _mm512_set1_epi32(uw); + } + + void store(void* ptr, int count = size()) const { + if (count != size()) { + memcpy(ptr, &vals, count * sizeof(value_type)); + } else { + _mm512_storeu_si512((__m512i*)ptr, vals); + } + } + + static Vectorized loadu(const void* ptr) { + return Vectorized(ptr); + } + + static Vectorized loadu(const void* ptr, int64_t count) { + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
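      // The loop-plus-memcpy below keeps the tail lanes well defined; the same
      // partial load can also be written with a zero-masking load, which
      // zero-fills the unselected lanes directly (a sketch, assuming AVX-512F;
      // not the path used here):
      //   __mmask16 k = (__mmask16)((1u << count) - 1);
      //   return Vectorized<c10::qint32>(_mm512_maskz_loadu_epi32(k, ptr));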
+ for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return loadu(tmp_values); + } + + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point, + Vectorized scale_zp_premul) const { + __m512 float_vals = _mm512_cvtepi32_ps(vals); + return {vec::fmadd(scale, Vectorized(float_vals), scale_zp_premul)}; + } + + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point) const { + __m512 float_vals = _mm512_cvtepi32_ps(vals); + return {(Vectorized(float_vals) - zero_point) * scale}; + } + + static Vectorized quantize( + const float_vec_return_type& rhs, + float scale, + int32_t zero_point, + float inverse_scale [[maybe_unused]]) { + Vectorized retval; + auto rhs_data = (__m512)rhs[0]; + at::native::quantize_vec( + scale, zero_point, (float*)&rhs_data, (c10::qint32*)&retval.vals, 16); + return retval; + } + + Vectorized maximum(Vectorized b) const { + return _mm512_max_epi32(vals, b.vals); + } + + Vectorized minimum(Vectorized b) const { + return _mm512_min_epi32(vals, b.vals); + } + + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + + Vectorized relu6( + Vectorized zero_point, + Vectorized q_six) { + return _mm512_min_epi32( + _mm512_max_epi32(vals, zero_point.vals), q_six.vals); + } + + int_vec_return_type widening_subtract(Vectorized b) const { + return {_mm512_sub_epi32(vals, b)}; + } + + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + __m512 multiplier_v = _mm512_set1_ps(multiplier); + __m512i zero_point_v = _mm512_set1_epi32(zero_point); + + __m512 scaled = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[0]), multiplier_v); + __m512i rounded = _mm512_cvtps_epi32(scaled); + return _mm512_add_epi32(rounded, zero_point_v); + } + + private: + // Load from memory constructor + Vectorized(const void* ptr) { + vals = _mm512_loadu_si512((const __m512i*)ptr); + } +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.maximum(b); } @@ -425,9 +652,29 @@ __m512i RequantizeAvx512( "Only int8_t/uint8_t are supported"); constexpr auto min_val = std::numeric_limits::min(); constexpr auto max_val = std::numeric_limits::max(); +<<<<<<< HEAD __m512i permute_mask_v = _mm512_set_epi32(0x0f, 0x0b, 0x07, 0x03, 0x0e, 0x0a, 0x06, 0x02, 0x0d, 0x09, 0x05, 0x01, 0x0c, 0x08, 0x04, 0x00); +======= + __m512i permute_mask_v = _mm512_set_epi32( + 0x0f, + 0x0b, + 0x07, + 0x03, + 0x0e, + 0x0a, + 0x06, + 0x02, + 0x0d, + 0x09, + 0x05, + 0x01, + 0x0c, + 0x08, + 0x04, + 0x00); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 x_scaled_v = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[0]), multiplier); __m512 y_scaled_v = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[1]), multiplier); __m512 z_scaled_v = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[2]), multiplier); @@ -453,12 +700,18 @@ __m512i RequantizeAvx512( /* * xyzw_clamped_v has results in the following layout so we need to +<<<<<<< HEAD * permute: x0-3 y0-3 z0-3 w0-3 x4-7 y4-7 z4-7 w4-7 x8-11 y8-11 z8-11 w8-11 x12-15 y12-15 z12-15 w12-15 +======= + * permute: x0-3 y0-3 z0-3 w0-3 x4-7 y4-7 z4-7 w4-7 x8-11 y8-11 z8-11 w8-11 + * x12-15 y12-15 z12-15 w12-15 +>>>>>>> 5729657180 ([ROCm] Specialized binary 
elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) */ xyzw_clamped_v = _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v); return xyzw_clamped_v; } +<<<<<<< HEAD template<> struct Vectorized : public Vectorizedqi { static constexpr int size() { @@ -526,23 +779,113 @@ struct Vectorized : public Vectorizedqi { __m512i cvtepi8_epi32(__m128i epi8_vals) const { return _mm512_cvtepi8_epi32(epi8_vals); } +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public Vectorizedqi { + static constexpr int size() { + return 64; + } + + static constexpr int float_num_vecs() { + return 4; + } + + static constexpr int int_num_vecs() { + return 4; + } + + using float_vec_return_type = std::array, 4>; + using int_vec_return_type = std::array, 4>; + using value_type = typename c10::qint8::underlying; + + public: + using Vectorizedqi::Vectorizedqi; + + Vectorized() {} + Vectorized(__m512i vals_) { + vals = vals_; + } + + // Broadcast constructor + Vectorized(const c10::qint8& val) { + value_type uw = val.val_; + vals = _mm512_set1_epi8(uw); + } + + // This is needed because the compiler emits awful code for the default + // constructor for moving the enum + Vectorized(const Vectorized& other) : Vectorizedqi(other.vals) {} + + // This is added to avoid error: definition of implicit copy assignment + // operator for 'Vectorized' is deprecated because it has a + // user-declared copy constructor [-Werror,-Wdeprecated-copy] + Vectorized& operator=(const Vectorized&) = default; + + void store(void* ptr, int count = size()) const { + if (count != size()) { + memcpy(ptr, &vals, count * sizeof(value_type)); + } else { + _mm512_storeu_si512((__m512i*)ptr, vals); + } + } + + static Vectorized loadu(const void* ptr) { + return Vectorized(ptr); + } + + static Vectorized loadu(const void* ptr, int64_t count) { + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
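      // (For the requantization path of this class: per lane, RequantizeAvx512
      // above computes q = saturate_to_int8(round_to_nearest_even(d * multiplier)
      // + zero_point), the rounding coming from _mm512_cvtps_epi32's default
      // rounding mode. A scalar sketch of the same arithmetic, illustrative
      // only and assuming <cmath> and <algorithm>:
      //   long r = lrintf((float)d * multiplier) + zero_point;
      //   int8_t q = (int8_t)std::clamp<long>(r, -128, 127);)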
+ for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return loadu(tmp_values); + } + + private: + __m512i cvtepi8_epi32(__m128i epi8_vals) const { + return _mm512_cvtepi8_epi32(epi8_vals); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) public: float_vec_return_type dequantize( Vectorized scale, Vectorized zero_point, Vectorized scale_neg_zp_premul) const { +<<<<<<< HEAD #if defined(_MSC_VER) && !defined(__clang__) +======= +#if defined(_MSC_VER) && !defined(__clang__) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +<<<<<<< HEAD #else +======= +#else +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +<<<<<<< HEAD #endif +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); @@ -563,17 +906,29 @@ struct Vectorized : public Vectorizedqi { float_vec_return_type dequantize( Vectorized scale, Vectorized zero_point) const { +<<<<<<< HEAD #if defined(_MSC_VER) && !defined(__clang__) +======= +#if defined(_MSC_VER) && !defined(__clang__) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +<<<<<<< HEAD #else +======= +#else +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +<<<<<<< HEAD #endif +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); @@ -600,6 +955,7 @@ struct Vectorized : public Vectorizedqi { } Vectorized maximum(Vectorized b) const { +<<<<<<< HEAD return _mm512_max_epi8(vals, b.vals); } @@ -750,23 +1106,198 @@ struct Vectorized : public Vectorizedqi { __m512i cvtepu8_epi32(__m128i epu8_vals) const { return _mm512_cvtepu8_epi32(epu8_vals); } +======= + return _mm512_max_epi8(vals, b.vals); + } + + Vectorized minimum(Vectorized b) const { + return _mm512_min_epi8(vals, b.vals); 
+ } + + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + + Vectorized relu6( + Vectorized zero_point, + Vectorized q_six) { + return _mm512_min_epi8(_mm512_max_epi8(vals, zero_point.vals), q_six.vals); + } + + int_vec_return_type widening_subtract(Vectorized b) const { +#if defined(_MSC_VER) && !defined(__clang__) + __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); + __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); + __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); + __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +#else + __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); + __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); + __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); + __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +#endif + + __m512i int32_val0 = cvtepi8_epi32(int_val0); + __m512i int32_val1 = cvtepi8_epi32(int_val1); + __m512i int32_val2 = cvtepi8_epi32(int_val2); + __m512i int32_val3 = cvtepi8_epi32(int_val3); + +#if defined(_MSC_VER) && !defined(__clang__) + __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); + __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); + __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); + __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); +#else + __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); + __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); + __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); + __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); +#endif + + __m512i int32_b0 = cvtepi8_epi32(int_b0); + __m512i int32_b1 = cvtepi8_epi32(int_b1); + __m512i int32_b2 = cvtepi8_epi32(int_b2); + __m512i int32_b3 = cvtepi8_epi32(int_b3); + + __m512i res_0 = _mm512_sub_epi32(int32_val0, int32_b0); + __m512i res_1 = _mm512_sub_epi32(int32_val1, int32_b1); + __m512i res_2 = _mm512_sub_epi32(int32_val2, int32_b2); + __m512i res_3 = _mm512_sub_epi32(int32_val3, int32_b3); + + return { + Vectorized(res_0), + Vectorized(res_1), + Vectorized(res_2), + Vectorized(res_3)}; + } + + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + __m512 multiplier_v = _mm512_set1_ps(multiplier); + __m512i zero_point_v = _mm512_set1_epi32(zero_point); + return RequantizeAvx512(inp, multiplier_v, zero_point_v); + } + + private: + // Load from memory constructor + Vectorized(const void* ptr) { + vals = _mm512_loadu_si512((const __m512i*)ptr); + } +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return a.maximum(b); +} + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public Vectorizedqi { + static constexpr int size() { + return 64; + } + + static constexpr int float_num_vecs() { + return 4; + } + + static constexpr int int_num_vecs() { + return 4; + } + + using float_vec_return_type = std::array, 4>; + using int_vec_return_type = std::array, 4>; + using value_type = typename c10::quint8::underlying; + + public: + using Vectorizedqi::Vectorizedqi; + Vectorized() {} + + Vectorized(__m512i vals_) { + vals = vals_; + } + + // Broadcast constructor + Vectorized(const c10::quint8& val) { + value_type uw = val.val_; + vals = _mm512_set1_epi8(uw); + } + + Vectorized(const Vectorized& other) : Vectorizedqi(other.vals) {} + + // This is added to avoid error: definition 
of implicit copy assignment + // operator for 'Vectorized' is deprecated because it has a + // user-declared copy constructor [-Werror,-Wdeprecated-copy] + Vectorized& operator=(const Vectorized&) = default; + + void store(void* ptr, int count = size()) const { + if (count != size()) { + memcpy(ptr, &vals, count * sizeof(value_type)); + } else { + _mm512_storeu_si512((__m512i*)ptr, vals); + } + } + + static Vectorized loadu(const void* ptr) { + return Vectorized(ptr); + } + + static Vectorized loadu(const void* ptr, int64_t count) { + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. + for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return loadu(tmp_values); + } + + private: + __m512i cvtepu8_epi32(__m128i epu8_vals) const { + return _mm512_cvtepu8_epi32(epu8_vals); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) public: float_vec_return_type dequantize( Vectorized scale, Vectorized zero_point, Vectorized scale_zp_premul) const { +<<<<<<< HEAD #if defined(_MSC_VER) && !defined(__clang__) +======= +#if defined(_MSC_VER) && !defined(__clang__) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +<<<<<<< HEAD #else +======= +#else +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +<<<<<<< HEAD #endif +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); @@ -788,17 +1319,29 @@ struct Vectorized : public Vectorizedqi { float_vec_return_type dequantize( Vectorized scale, Vectorized zero_point) const { +<<<<<<< HEAD #if defined(_MSC_VER) && !defined(__clang__) +======= +#if defined(_MSC_VER) && !defined(__clang__) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +<<<<<<< HEAD #else +======= +#else +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); __m128i int_val1 = 
_mm_set_epi64x(vals[3], vals[2]); __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +<<<<<<< HEAD #endif +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); @@ -826,6 +1369,7 @@ struct Vectorized : public Vectorizedqi { } Vectorized maximum(Vectorized b) const { +<<<<<<< HEAD return _mm512_max_epu8(vals, b.vals); } @@ -908,6 +1452,91 @@ struct Vectorized : public Vectorizedqi { template <> Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= + return _mm512_max_epu8(vals, b.vals); + } + + Vectorized minimum(Vectorized b) const { + return _mm512_min_epu8(vals, b.vals); + } + + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + + Vectorized relu6( + Vectorized zero_point, + Vectorized q_six) { + return _mm512_min_epu8(_mm512_max_epu8(vals, zero_point.vals), q_six.vals); + } + + int_vec_return_type widening_subtract(Vectorized b) const { +#if defined(_MSC_VER) && !defined(__clang__) + __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); + __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); + __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); + __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +#else + __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); + __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); + __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); + __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +#endif + + __m512i int32_val0 = cvtepu8_epi32(int_val0); + __m512i int32_val1 = cvtepu8_epi32(int_val1); + __m512i int32_val2 = cvtepu8_epi32(int_val2); + __m512i int32_val3 = cvtepu8_epi32(int_val3); + +#if defined(_MSC_VER) && !defined(__clang__) + __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); + __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); + __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); + __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); +#else + __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); + __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); + __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); + __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); +#endif + + __m512i int32_b0 = cvtepu8_epi32(int_b0); + __m512i int32_b1 = cvtepu8_epi32(int_b1); + __m512i int32_b2 = cvtepu8_epi32(int_b2); + __m512i int32_b3 = cvtepu8_epi32(int_b3); + + __m512i res_0 = _mm512_sub_epi32(int32_val0, int32_b0); + __m512i res_1 = _mm512_sub_epi32(int32_val1, int32_b1); + __m512i res_2 = _mm512_sub_epi32(int32_val2, int32_b2); + __m512i res_3 = _mm512_sub_epi32(int32_val3, int32_b3); + return { + Vectorized(res_0), + Vectorized(res_1), + Vectorized(res_2), + Vectorized(res_3)}; + } + + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + __m512 multiplier_v = _mm512_set1_ps(multiplier); + __m512i zero_point_v = _mm512_set1_epi32(zero_point); + return RequantizeAvx512(inp, multiplier_v, zero_point_v); + } + + private: + // Load from memory constructor + Vectorized(const void* ptr) { + vals = _mm512_loadu_si512((const __m512i*)ptr); + } +}; + +template <> +Vectorized inline maximum( + const 
Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.maximum(b); } @@ -964,7 +1593,12 @@ struct VectorizedQuantizedConverter { tmp_vals[j] = at::native::dequantize_val( scale[j], zero_point[j], T(vals[16 * i + j])); } +<<<<<<< HEAD rv[i] = Vectorized(tmp_vals[0], +======= + rv[i] = Vectorized( + tmp_vals[0], +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) tmp_vals[1], tmp_vals[2], tmp_vals[3], @@ -996,11 +1630,22 @@ struct VectorizedQuantizedConverter { }; template <> +<<<<<<< HEAD struct Vectorized : public VectorizedQuantizedConverter< c10::qint32, std::array, 1>, std::array, 1>, 16> { +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public VectorizedQuantizedConverter< + c10::qint32, + std::array, 1>, + std::array, 1>, + 16> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized() : VectorizedQuantizedConverter< c10::qint32, @@ -1026,6 +1671,7 @@ struct Vectorized : public VectorizedQuantizedConverter< static Vectorized loadu(const void* ptr, int64_t count) { __at_align__ value_type tmp_values[size()]; +<<<<<<< HEAD // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. @@ -1033,6 +1679,19 @@ struct Vectorized : public VectorizedQuantizedConverter< tmp_values[i] = 0; } std::memcpy(tmp_values, reinterpret_cast(ptr), count * sizeof(value_type)); +======= + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
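Editor's note: the `dequantize` members above, both the AVX-512 paths that split the 512-bit register into four 128-bit groups, widen each 8-bit lane to int32 and convert to float, and the scalar fallback in `VectorizedQuantizedConverter` that calls `at::native::dequantize_val` per lane, realize the usual affine mapping real = scale * (q - zero_point). The sketch below restates that mapping in scalar form; the helper names and constants are illustrative, and the remark about `scale_neg_zp_premul` matching a fused multiply-add is an inference from the parameter name, not something shown in this hunk.

```cpp
#include <cstdint>
#include <cstdio>

// Usual affine dequantization convention: real = scale * (q - zero_point).
float dequantize(int8_t q, float scale, float zero_point) {
  return scale * (static_cast<float>(q) - zero_point);
}

// Algebraically identical form shaped like a fused multiply-add:
// real = q * scale + (-zero_point * scale), with the second term
// precomputed once (cf. the scale_neg_zp_premul parameter above).
float dequantize_premul(int8_t q, float scale, float scale_neg_zp_premul) {
  return static_cast<float>(q) * scale + scale_neg_zp_premul;
}

int main() {
  const float scale = 0.05f, zero_point = 4.0f;
  const float premul = -zero_point * scale;
  for (int q : {-8, 0, 4, 100}) {
    std::printf("q=%4d  direct=%f  premul=%f\n", q,
                dequantize(static_cast<int8_t>(q), scale, zero_point),
                dequantize_premul(static_cast<int8_t>(q), scale, premul));
  }
}
```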
+ for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return loadu(tmp_values); } @@ -1074,11 +1733,18 @@ struct Vectorized : public VectorizedQuantizedConverter< return retval; } +<<<<<<< HEAD Vectorized relu(Vectorized zero_point) const { return maximum(zero_point); } +======= + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized relu6( Vectorized zero_point, Vectorized q_six) { @@ -1113,7 +1779,13 @@ struct Vectorized : public VectorizedQuantizedConverter< }; template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.maximum(b); } @@ -1140,11 +1812,22 @@ Vectorized inline operator+( } template <> +<<<<<<< HEAD struct Vectorized : public VectorizedQuantizedConverter< c10::qint8, std::array, 4>, std::array, 4>, 64> { +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public VectorizedQuantizedConverter< + c10::qint8, + std::array, 4>, + std::array, 4>, + 64> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized() : VectorizedQuantizedConverter< c10::qint8, @@ -1170,6 +1853,7 @@ struct Vectorized : public VectorizedQuantizedConverter< static Vectorized loadu(const void* ptr, int64_t count) { __at_align__ value_type tmp_values[size()]; +<<<<<<< HEAD // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. @@ -1177,6 +1861,19 @@ struct Vectorized : public VectorizedQuantizedConverter< tmp_values[i] = 0; } std::memcpy(tmp_values, reinterpret_cast(ptr), count * sizeof(value_type)); +======= + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
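Editor's note: the comment just above (repeated in every `loadu(ptr, count)` overload in this file) describes the partial-load pattern: zero a stack buffer with a loop, copy only `count` valid elements, then perform the full-width load from the buffer. Below is a standalone sketch of that pattern with plain arrays, assuming nothing beyond the standard library (the AVX-512 load itself is stubbed out with a second memcpy, and the helper name is illustrative):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

constexpr int kLanes = 64;  // e.g. 64 int8 lanes in a 512-bit register

// Stand-in for Vectorized<T>::loadu(ptr, count): only `count` elements at
// `ptr` are valid, but the register-width load must not observe garbage.
void partial_load(const void* ptr, int64_t count, int8_t out[kLanes]) {
  int8_t tmp[kLanes];
  // Zero with a loop rather than "= {0}"; per the comment above, gcc
  // compiles the loop to a single instruction for this buffer.
  for (int i = 0; i < kLanes; ++i) {
    tmp[i] = 0;
  }
  // Copy only the valid prefix; the zeroed tail guarantees uninitialized
  // memory never leaks into the vector (pytorch/pytorch#32502).
  std::memcpy(tmp, ptr, count * sizeof(int8_t));
  // A real implementation would issue a full-width loadu from tmp here.
  std::memcpy(out, tmp, sizeof(tmp));
}

int main() {
  int8_t src[3] = {1, 2, 3};
  int8_t lanes[kLanes];
  partial_load(src, 3, lanes);
  std::printf("%d %d %d %d\n", lanes[0], lanes[1], lanes[2], lanes[3]);  // 1 2 3 0
}
```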
+ for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return loadu(tmp_values); } @@ -1267,16 +1964,33 @@ struct Vectorized : public VectorizedQuantizedConverter< }; template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.maximum(b); } template <> +<<<<<<< HEAD struct Vectorized : public VectorizedQuantizedConverter< c10::quint8, std::array, 4>, std::array, 4>, 64> { +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public VectorizedQuantizedConverter< + c10::quint8, + std::array, 4>, + std::array, 4>, + 64> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized() : VectorizedQuantizedConverter< c10::quint8, @@ -1302,6 +2016,7 @@ struct Vectorized : public VectorizedQuantizedConverter< static Vectorized loadu(const void* ptr, int64_t count) { __at_align__ value_type tmp_values[size()]; +<<<<<<< HEAD // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. @@ -1309,6 +2024,19 @@ struct Vectorized : public VectorizedQuantizedConverter< tmp_values[i] = 0; } std::memcpy(tmp_values, reinterpret_cast(ptr), count * sizeof(value_type)); +======= + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
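Editor's note: the `relu` and `relu6` members that recur throughout these quantized classes never dequantize. `relu` is simply `maximum(zero_point)` and `relu6` clamps between `zero_point` and `q_six`, which works because the zero point encodes real 0.0 and `q_six` encodes real 6.0 under the affine scheme. A scalar sketch of that identity follows; the scale, zero point and `quantize` helper are illustrative values, not taken from this file.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// real = scale * (q - zero_point); zero_point is the integer encoding 0.0.
int32_t quantize(float real, float scale, int32_t zero_point) {
  return zero_point + static_cast<int32_t>(std::lround(real / scale));
}

int main() {
  const float   scale = 0.1f;
  const int32_t zp    = 10;                         // encodes 0.0
  const int32_t q6    = quantize(6.0f, scale, zp);  // encodes 6.0

  for (int q : {2, 10, 37, 65, 90}) {
    int relu_q  = std::max(q, static_cast<int>(zp));       // relu(zero_point)
    int relu6_q = std::min(relu_q, static_cast<int>(q6));  // relu6(zero_point, q_six)
    std::printf("q=%2d real=%5.2f relu=%5.2f relu6=%5.2f\n", q,
                scale * (q - zp), scale * (relu_q - zp), scale * (relu6_q - zp));
  }
}
```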
+ for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return loadu(tmp_values); } @@ -1354,7 +2082,10 @@ struct Vectorized : public VectorizedQuantizedConverter< return maximum(zero_point); } +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized relu6( Vectorized zero_point, Vectorized q_six) { @@ -1400,10 +2131,22 @@ struct Vectorized : public VectorizedQuantizedConverter< }; template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.maximum(b); } #endif // defined(CPU_CAPABILITY_AVX512) && !defined(MSVC) +<<<<<<< HEAD }}} +======= +} // namespace CPU_CAPABILITY +} // namespace vec +} // namespace at +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h index 2591338881ae..6f9b24d08089 100644 --- a/aten/src/ATen/cpu/vec/vec_base.h +++ b/aten/src/ATen/cpu/vec/vec_base.h @@ -1,5 +1,10 @@ #pragma once +<<<<<<< HEAD #if defined(__GNUC__) && __GNUC__ == 10 && __GNUC_MINOR__ <= 2 && defined(__ARM_FEATURE_SVE) +======= +#if defined(__GNUC__) && __GNUC__ == 10 && __GNUC_MINOR__ <= 2 && \ + defined(__ARM_FEATURE_SVE) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Workaround for https: //gcc.gnu.org/bugzilla/show_bug.cgi?id=117161 #pragma GCC optimize("no-tree-vectorize") #endif @@ -18,6 +23,7 @@ // See https://github.com/pytorch/pytorch/issues/37577 for an instance // of this bug in the past. +<<<<<<< HEAD #include #include #include @@ -39,6 +45,29 @@ #include #include #include +======= +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if defined(__GNUC__) #define __FORCE_INLINE __attribute__((always_inline)) inline @@ -66,7 +95,12 @@ Windows llvm will not have this definition. #endif #define VECTOR_WIDTH 64 #define int_vector __m512i +<<<<<<< HEAD #elif defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE) // CPU_CAPABILITY_AVX512 +======= +#elif defined(__aarch64__) && \ + !defined(CPU_CAPABILITY_SVE) // CPU_CAPABILITY_AVX512 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // SVE code expects 256-vectors; leave that set for SVE? 
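Editor's note: the `vec_base.h` hunk above fixes `VECTOR_WIDTH` at 64 bytes for AVX-512 (with `int_vector` being `__m512i`), and the generic `Vectorized<T>` defined further down stores `VECTOR_WIDTH / sizeof(T)` scalars. That ratio is where the `size() == 64` of the qint8/quint8 specializations and the `size() == 16` of qint32 come from. A compile-time sketch of those lane counts; the 64-byte constant is restated here rather than derived.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

constexpr std::size_t kVectorWidth = 64;  // bytes per AVX-512 register

template <typename T>
constexpr std::size_t lanes() { return kVectorWidth / sizeof(T); }

static_assert(lanes<float>()   == 16, "16 float lanes per 512-bit register");
static_assert(lanes<double>()  ==  8, "8 double lanes");
static_assert(lanes<int8_t>()  == 64, "64 lanes, matching size() == 64 for qint8/quint8");
static_assert(lanes<int32_t>() == 16, "16 lanes, matching size() == 16 for qint32");

int main() {
  std::printf("float:%zu double:%zu int8:%zu int32:%zu\n",
              lanes<float>(), lanes<double>(), lanes<int8_t>(), lanes<int32_t>());
}
```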
#if defined(__GNUC__) #define __at_align__ __attribute__((aligned(16))) @@ -93,6 +127,7 @@ namespace at::vec { inline namespace CPU_CAPABILITY { // at::Half and at::BFloat16 should be treated as floating point template +<<<<<<< HEAD struct is_floating_point: std::integral_constant || @@ -118,15 +153,52 @@ struct is_8bit_integer: std::integral_constant || std::is_same_v> { +======= +struct is_floating_point + : std::integral_constant< + bool, + std::is_floating_point_v || std::is_same_v || + std::is_same_v> {}; + +template +constexpr bool is_floating_point_v = is_floating_point::value; + +template +struct is_reduced_floating_point + : std::integral_constant< + bool, + std::is_same_v || std::is_same_v> {}; + +template +constexpr bool is_reduced_floating_point_v = + is_reduced_floating_point::value; + +template +struct is_8bit_integer + : std::integral_constant< + bool, + std::is_same_v || std::is_same_v> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; template constexpr bool is_8bit_integer_v = is_8bit_integer::value; +<<<<<<< HEAD template struct int_of_size; #define DEFINE_INT_OF_SIZE(int_t) \ template<> struct int_of_size { using type = int_t; } +======= +template +struct int_of_size; + +#define DEFINE_INT_OF_SIZE(int_t) \ + template <> \ + struct int_of_size { \ + using type = int_t; \ + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DEFINE_INT_OF_SIZE(int64_t); DEFINE_INT_OF_SIZE(int32_t); @@ -138,18 +210,55 @@ DEFINE_INT_OF_SIZE(int8_t); template using int_same_size_t = typename int_of_size::type; +<<<<<<< HEAD // NOTE: If you specialize on a type, you must define all operations! // emulates Vectorized types #if defined(__s390x__) template +======= +/** + * Detect at compile time whether Vectorized has an explicit + * specialization for T. (You are required to specialize this type + * whenever you specialize Vectorized). Useful for generic algorithms + * to decide whether to rely on a specialization being fast. For + * example, they might choose to handle reduced-precision floating + * point types directly if they're supported, or convert through float + * if not. + */ +#if defined(__s390x__) +template +#else +template +#endif +struct is_vec_specialized_for : std::bool_constant { +}; + +template +constexpr bool is_vec_specialized_for_v = is_vec_specialized_for::value; + +// NOTE: If you specialize Vectorized on a type, you must define all +// operations! You must also specialize is_vec_specialized_for for +// that type. + +// emulates Vectorized types +#if defined(__s390x__) +template +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #else template #endif struct Vectorized { +<<<<<<< HEAD private: __at_align__ T values[VECTOR_WIDTH / sizeof(T)]; public: +======= + private: + __at_align__ T values[VECTOR_WIDTH / sizeof(T)]; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = T; using size_type = int; @@ -163,11 +272,19 @@ struct Vectorized { values[i] = val; } } +<<<<<<< HEAD template> Vectorized(Args... vals) : values{vals...}{ } Vectorized(const T(&arr)[kSize]) { +======= + template < + typename... Args, + typename = std::enable_if_t<(sizeof...(Args) == size())>> + Vectorized(Args... 
vals) : values{vals...} {} + Vectorized(const T (&arr)[kSize]) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::memcpy(values, arr, sizeof(values)); } // This also implies const T& operator[](int idx) const @@ -198,20 +315,39 @@ struct Vectorized { } // Workaround for https: //gcc.gnu.org/bugzilla/show_bug.cgi?id=117001 #if __GNUC__ <= 12 && !defined(__clang__) && defined(__ARM_FEATURE_SVE) +<<<<<<< HEAD static Vectorized __attribute__ ((optimize("-fno-tree-loop-vectorize"))) blendv(const Vectorized& a, #else static Vectorized blendv(const Vectorized& a, #endif const Vectorized& b, const Vectorized& mask) { +======= + static Vectorized __attribute__((optimize("-fno-tree-loop-vectorize"))) + blendv( + const Vectorized& a, +#else + static Vectorized blendv( + const Vectorized& a, +#endif + const Vectorized& b, + const Vectorized& mask) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized vector; int_same_size_t buffer[size()]; mask.store(buffer); #if defined(__clang__) && __ARM_FEATURE_SVE +<<<<<<< HEAD #pragma clang loop vectorize(disable) #endif for (const auto i : c10::irange(size())) { if (buffer[i] & 0x01) { +======= +#pragma clang loop vectorize(disable) +#endif + for (const auto i : c10::irange(size())) { + if (buffer[i] & 0x01) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) vector[i] = b[i]; } else { vector[i] = a[i]; @@ -219,15 +355,30 @@ struct Vectorized { } return vector; } +<<<<<<< HEAD template // step sometimes requires a higher precision type (e.g., T=int, step_t=double) static Vectorized arange(T base = static_cast(0), step_t step = static_cast(1)) { +======= + template // step sometimes requires a higher precision type + // (e.g., T=int, step_t=double) + static Vectorized arange( + T base = static_cast(0), + step_t step = static_cast(1)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized vector; for (const auto i : c10::irange(size())) { vector.values[i] = base + i * step; } return vector; } +<<<<<<< HEAD static Vectorized set(const Vectorized& a, const Vectorized& b, int64_t count = size()) { +======= + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized vector; for (const auto i : c10::irange(size())) { if (i < count) { @@ -249,7 +400,13 @@ struct Vectorized { return vector; } static Vectorized loadu_one_fourth(const void* ptr) { +<<<<<<< HEAD static_assert(std::is_same_v || std::is_same_v, "For byte types only"); +======= + static_assert( + std::is_same_v || std::is_same_v, + "For byte types only"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized::loadu(ptr, 8); } @@ -257,9 +414,16 @@ struct Vectorized { std::memcpy(ptr, values, count * sizeof(T)); } int zero_mask() const { +<<<<<<< HEAD // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit int mask = 0; for (int i = 0; i < size(); ++ i) { +======= + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated 
to 0-bit + int mask = 0; + for (int i = 0; i < size(); ++i) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (values[i] == static_cast(0)) { mask |= (1 << i); } @@ -279,15 +443,28 @@ struct Vectorized { } bool has_inf_nan() const { for (int64_t i = 0; i != size(); i++) { +<<<<<<< HEAD if(_isnan(values[i]) || _isinf(values[i])) { +======= + if (_isnan(values[i]) || _isinf(values[i])) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return true; } } return false; } +<<<<<<< HEAD // MSVC versions between 14.36 and 14.42 has a loop unrolling bug on Windows Arm64 // See https://developercommunity.visualstudio.com/t/MSVC-loop-unrolling-problem-194033813-/10720692 #if defined(_WIN32) && defined(__aarch64__) && ((_MSVC_VER >= 1936) && (_MSVC_VER <= 1942)) +======= +// MSVC versions between 14.36 and 14.42 has a loop unrolling bug on Windows +// Arm64 +// See +// https://developercommunity.visualstudio.com/t/MSVC-loop-unrolling-problem-194033813-/10720692 +#if defined(_WIN32) && defined(__aarch64__) && \ + ((_MSVC_VER >= 1936) && (_MSVC_VER <= 1942)) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized map(T (*const f)(T)) const { Vectorized ret; for (int64_t i = 0; i < size(); i++) { @@ -322,27 +499,45 @@ struct Vectorized { return ret; } #endif +<<<<<<< HEAD Vectorized map(T (*const f)(const T &)) const { +======= + Vectorized map(T (*const f)(const T&)) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized ret; for (int64_t i = 0; i != size(); i++) { ret[i] = f(values[i]); } return ret; } +<<<<<<< HEAD T reduce(T (*const f)(const T &)) const { +======= + T reduce(T (*const f)(const T&)) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) T ret = 0; for (int64_t i = 0; i != size(); i++) { ret = f(ret, values[i]); } return ret; } +<<<<<<< HEAD template && !c10::is_complex::value, int> = 0> +======= + template < + typename other_t_abs = T, + typename std::enable_if_t< + !is_floating_point_v && + !c10::is_complex::value, + int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized abs() const { // other_t_abs is for SFINAE and clarity. Make sure it is not changed. static_assert(std::is_same_v, "other_t_abs must be T"); return map([](T x) -> T { return x < static_cast(0) ? -x : x; }); } +<<<<<<< HEAD template , int> = 0> Vectorized abs() const { @@ -354,6 +549,21 @@ struct Vectorized { } template ::value, int> = 0> +======= + template < + typename float_t_abs = T, + typename std::enable_if_t, int> = 0> + Vectorized abs() const { + // float_t_abs is for SFINAE and clarity. Make sure it is not changed. + static_assert(std::is_same_v, "float_t_abs must be T"); + // Specifically deal with floating-point because the generic code above + // won't handle -0.0 (which should result in 0.0) properly. 
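Editor's note: the comment above is the reason the floating-point `abs()` overload goes through `std::abs` rather than the generic `x < static_cast<T>(0) ? -x : x` used for integral types: `-0.0 < 0` is false, so the ternary would hand back `-0.0` unchanged, while the expected result is `+0.0`. A short standalone demonstration:

```cpp
#include <cmath>
#include <cstdio>

int main() {
  double x = -0.0;
  double generic = (x < 0.0) ? -x : x;  // -0.0 < 0 is false, keeps the sign bit
  double proper  = std::abs(x);         // clears the sign bit
  std::printf("generic:  %g (signbit=%d)\n", generic, std::signbit(generic));
  std::printf("std::abs: %g (signbit=%d)\n", proper,  std::signbit(proper));
}
```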
+ return map([](T x) -> T { return std::abs(x); }); + } + template < + typename complex_t_abs = T, + typename std::enable_if_t::value, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized abs() const { // complex_t_abs is for SFINAE and clarity. Make sure it is not changed. static_assert(std::is_same_v, "complex_t_abs must be T"); @@ -361,12 +571,19 @@ struct Vectorized { return map([](T x) { return static_cast(std::abs(x)); }); } +<<<<<<< HEAD template ::value, int> = 0> +======= + template < + typename other_t_sgn = T, + typename std::enable_if_t::value, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized sgn() const { return map(at::native::sgn_impl); } +<<<<<<< HEAD template ::value, int> = 0> Vectorized angle() const { @@ -383,11 +600,37 @@ struct Vectorized { } template ::value, int> = 0> +======= + template < + typename other_t_angle = T, + typename std::enable_if_t::value, int> = + 0> + Vectorized angle() const { + // other_t_angle is for SFINAE and clarity. Make sure it is not changed. + static_assert(std::is_same_v, "other_t_angle must be T"); + return map(at::native::angle_impl); // compiler is unable to resolve the + // overload without + } + template < + typename complex_t_angle = T, + typename std::enable_if_t::value, int> = + 0> + Vectorized angle() const { + // complex_t_angle is for SFINAE and clarity. Make sure it is not changed. + static_assert( + std::is_same_v, "complex_t_angle must be T"); + return map([](T x) { return static_cast(std::arg(x)); }); + } + template < + typename other_t_real = T, + typename std::enable_if_t::value, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized real() const { // other_t_real is for SFINAE and clarity. Make sure it is not changed. static_assert(std::is_same_v, "other_t_real must be T"); return *this; } +<<<<<<< HEAD template ::value, int> = 0> Vectorized real() const { @@ -397,11 +640,27 @@ struct Vectorized { } template ::value, int> = 0> +======= + template < + typename complex_t_real = T, + typename std::enable_if_t::value, int> = + 0> + Vectorized real() const { + // complex_t_real is for SFINAE and clarity. Make sure it is not changed. + static_assert( + std::is_same_v, "complex_t_real must be T"); + return map([](T x) { return static_cast(x.real()); }); + } + template < + typename other_t_imag = T, + typename std::enable_if_t::value, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized imag() const { // other_t_imag is for SFINAE and clarity. Make sure it is not changed. static_assert(std::is_same_v, "other_t_imag must be T"); return Vectorized(0); } +<<<<<<< HEAD template ::value, int> = 0> Vectorized imag() const { @@ -411,16 +670,42 @@ struct Vectorized { } template ::value, int> = 0> +======= + template < + typename complex_t_imag = T, + typename std::enable_if_t::value, int> = + 0> + Vectorized imag() const { + // complex_t_imag is for SFINAE and clarity. Make sure it is not changed. 
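Editor's note: `real()`, `imag()`, `angle()` and `conj()` each come as a pair of overloads selected with `std::enable_if_t` on `c10::is_complex<T>`, using a defaulted dummy template parameter (`other_t_real`, `complex_t_real`, and so on) so that SFINAE applies, plus a `static_assert` so callers cannot override that parameter. Below is a reduced sketch of the same dispatch pattern, substituting `std::is_floating_point` for `c10::is_complex` so it compiles on its own; the `Holder`/`describe` names are illustrative.

```cpp
#include <cstdio>
#include <type_traits>

template <typename T>
struct Holder {
  T value;

  // Chosen when T is NOT floating point (dummy parameter U exists only for SFINAE).
  template <typename U = T,
            std::enable_if_t<!std::is_floating_point_v<U>, int> = 0>
  const char* describe() const {
    static_assert(std::is_same_v<U, T>, "U is for SFINAE only; do not change it");
    return "non-floating-point path";
  }

  // Chosen when T IS floating point.
  template <typename U = T,
            std::enable_if_t<std::is_floating_point_v<U>, int> = 0>
  const char* describe() const {
    static_assert(std::is_same_v<U, T>, "U is for SFINAE only; do not change it");
    return "floating-point path";
  }
};

int main() {
  std::printf("%s\n", Holder<int>{1}.describe());      // non-floating-point path
  std::printf("%s\n", Holder<float>{1.f}.describe());  // floating-point path
}
```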
+ static_assert( + std::is_same_v, "complex_t_imag must be T"); + return map([](T x) { return static_cast(x.imag()); }); + } + template < + typename other_t_conj = T, + typename std::enable_if_t::value, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized conj() const { // other_t_conj is for SFINAE and clarity. Make sure it is not changed. static_assert(std::is_same_v, "other_t_conj must be T"); return *this; } +<<<<<<< HEAD template ::value, int> = 0> Vectorized conj() const { // complex_t_conj is for SFINAE and clarity. Make sure it is not changed. static_assert(std::is_same_v, "complex_t_conj must be T"); +======= + template < + typename complex_t_conj = T, + typename std::enable_if_t::value, int> = + 0> + Vectorized conj() const { + // complex_t_conj is for SFINAE and clarity. Make sure it is not changed. + static_assert( + std::is_same_v, "complex_t_conj must be T"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map([](T x) { return static_cast(std::conj(x)); }); } Vectorized acos() const { @@ -441,7 +726,11 @@ struct Vectorized { Vectorized atanh() const { return map(std::atanh); } +<<<<<<< HEAD Vectorized atan2(const Vectorized &exp) const { +======= + Vectorized atan2(const Vectorized& exp) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized ret; for (const auto i : c10::irange(size())) { ret[i] = std::atan2(values[i], exp[i]); @@ -449,9 +738,15 @@ struct Vectorized { return ret; } template < +<<<<<<< HEAD typename U = T, typename std::enable_if_t, int> = 0> Vectorized copysign(const Vectorized &sign) const { +======= + typename U = T, + typename std::enable_if_t, int> = 0> + Vectorized copysign(const Vectorized& sign) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized ret; for (size_type i = 0; i < size(); i++) { ret[i] = c10::copysign(values[i], sign[i]); @@ -483,8 +778,13 @@ struct Vectorized { return *this - this->trunc(); } template < +<<<<<<< HEAD typename U = T, typename std::enable_if_t, int> = 0> +======= + typename U = T, + typename std::enable_if_t, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized fmod(const Vectorized& q) const { // U is for SFINAE purposes only. Make sure it is not changed. static_assert(std::is_same_v, "U must be T"); @@ -503,13 +803,20 @@ struct Vectorized { Vectorized log1p() const { return map(std::log1p); } +<<<<<<< HEAD template ::value, int> = 0> +======= + template < + typename other_t_log2 = T, + typename std::enable_if_t::value, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized log2() const { // other_t_log2 is for SFINAE and clarity. Make sure it is not changed. 
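Editor's note: the real-valued `log2()` overload here maps straight to `std::log2`; the complex overload just below instead computes `log(x) / log(2)`, since the standard library provides no complex `log2`. A standalone check of that change-of-base identity with `std::complex` (the sample value is arbitrary):

```cpp
#include <complex>
#include <cstdio>

int main() {
  std::complex<double> z(3.0, 4.0);
  std::complex<double> log2_z = std::log(z) / std::log(2.0);  // log2 via change of base
  std::complex<double> back   = std::pow(2.0, log2_z);        // should reproduce z
  std::printf("log2(3+4i) = %f%+fi, round trip = %f%+fi\n",
              log2_z.real(), log2_z.imag(), back.real(), back.imag());
}
```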
static_assert(std::is_same_v, "other_t_log2 must be T"); return map(std::log2); } +<<<<<<< HEAD template ::value, int> = 0> Vectorized log2() const { @@ -517,6 +824,18 @@ struct Vectorized { static_assert(std::is_same_v, "complex_t_log2 must be T"); const T log_2 = T(std::log(2.0)); return Vectorized(map(std::log))/Vectorized(log_2); +======= + template < + typename complex_t_log2 = T, + typename std::enable_if_t::value, int> = + 0> + Vectorized log2() const { + // complex_t_log2 is for SFINAE and clarity. Make sure it is not changed. + static_assert( + std::is_same_v, "complex_t_log2 must be T"); + const T log_2 = T(std::log(2.0)); + return Vectorized(map(std::log)) / Vectorized(log_2); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized ceil() const { return map(at::native::ceil_impl); @@ -530,7 +849,11 @@ struct Vectorized { Vectorized floor() const { return map(at::native::floor_impl); } +<<<<<<< HEAD Vectorized hypot(const Vectorized &b) const { +======= + Vectorized hypot(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized ret; for (const auto i : c10::irange(size())) { ret[i] = std::hypot(values[i], b[i]); @@ -546,14 +869,22 @@ struct Vectorized { Vectorized digamma() const { return map(calc_digamma); } +<<<<<<< HEAD Vectorized igamma(const Vectorized &x) const { +======= + Vectorized igamma(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized ret; for (const auto i : c10::irange(size())) { ret[i] = calc_igamma(values[i], x[i]); } return ret; } +<<<<<<< HEAD Vectorized igammac(const Vectorized &x) const { +======= + Vectorized igammac(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized ret; for (const auto i : c10::irange(size())) { ret[i] = calc_igammac(values[i], x[i]); @@ -566,7 +897,11 @@ struct Vectorized { // promotion return map([](T x) -> T { return -x; }); } +<<<<<<< HEAD Vectorized nextafter(const Vectorized &b) const { +======= + Vectorized nextafter(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized ret; for (const auto i : c10::irange(size())) { ret[i] = std::nextafter(values[i], b[i]); @@ -574,7 +909,12 @@ struct Vectorized { return ret; } Vectorized round() const { +<<<<<<< HEAD // We do not use std::round because we would like to round midway numbers to the nearest even integer. +======= + // We do not use std::round because we would like to round midway numbers to + // the nearest even integer. 
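Editor's note: the comment above is the reason `round()` maps through `at::native::round_impl` rather than `std::round`: `std::round` sends halfway cases away from zero, whereas the intent is round-half-to-even. The sketch below only illustrates the difference between the two conventions, using `std::nearbyint` under the default round-to-nearest environment as a stand-in for half-to-even; it does not claim to reproduce `round_impl` itself.

```cpp
#include <cmath>
#include <cstdio>

int main() {
  const double halves[] = {0.5, 1.5, 2.5, -0.5, -1.5, -2.5};
  for (double x : halves) {
    // std::round: away from zero            -> 1, 2, 3, -1, -2, -3
    // std::nearbyint (round-to-nearest mode): ties to even -> 0, 2, 2, -0, -2, -2
    std::printf("x=%5.1f  round=%5.1f  nearbyint=%5.1f\n",
                x, std::round(x), std::nearbyint(x));
  }
}
```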
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(at::native::round_impl); } Vectorized sin() const { @@ -604,20 +944,33 @@ struct Vectorized { Vectorized rsqrt() const { return map([](T x) { return (T)1 / std::sqrt(x); }); } +<<<<<<< HEAD Vectorized pow(const Vectorized &exp) const { +======= + Vectorized pow(const Vectorized& exp) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized ret; for (const auto i : c10::irange(size())) { ret[i] = std::pow(values[i], exp[i]); } return ret; } +<<<<<<< HEAD T reduce_add() const { +======= + T reduce_add() const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return reduce([](T x, T y) -> T { return x + y; }); } T reduce_max() const { return reduce(std::max); } +<<<<<<< HEAD private: +======= + + private: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template inline Vectorized binary_pred(const Vectorized& other, Op op) const { // All bits are set to 1 if the pred is true, otherwise 0. @@ -632,6 +985,7 @@ struct Vectorized { return vector; } +<<<<<<< HEAD public: Vectorized operator==(const Vectorized& other) const { return binary_pred(other, std::equal_to()); } Vectorized operator!=(const Vectorized& other) const { return binary_pred(other, std::not_equal_to()); } @@ -646,11 +1000,41 @@ struct Vectorized { // 1 if the pred is true, otherwise 0. Vectorized vector; for (int i = 0; i != size(); ++ i) { +======= + public: + Vectorized operator==(const Vectorized& other) const { + return binary_pred(other, std::equal_to()); + } + Vectorized operator!=(const Vectorized& other) const { + return binary_pred(other, std::not_equal_to()); + } + Vectorized operator>=(const Vectorized& other) const { + return binary_pred(other, std::greater_equal()); + } + Vectorized operator<=(const Vectorized& other) const { + return binary_pred(other, std::less_equal()); + } + Vectorized operator>(const Vectorized& other) const { + return binary_pred(other, std::greater()); + } + Vectorized operator<(const Vectorized& other) const { + return binary_pred(other, std::less()); + } + + private: + template + inline Vectorized binary_pred_bool(const Vectorized& other, Op op) + const { + // 1 if the pred is true, otherwise 0. 
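Editor's note: there are two comparison families in this class. `operator==`, `operator<`, ... go through `binary_pred` and fill a lane with all bits set when the predicate holds (a mask whose set low bit can drive `blendv`), while `eq`, `lt`, ... go through `binary_pred_bool` and produce plain 0/1 values, as the comment above says. A scalar sketch of the two encodings for a single float lane, using the same-width-integer bit copy the header relies on; the helper names are illustrative.

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// binary_pred-style lane: every bit set when the predicate is true.
float mask_lane(bool pred) {
  uint32_t bits = pred ? 0xFFFFFFFFu : 0u;
  float out;
  std::memcpy(&out, &bits, sizeof(out));  // bit copy, not a value conversion
  return out;
}

// binary_pred_bool-style lane: numeric 0 or 1.
float bool_lane(bool pred) { return pred ? 1.f : 0.f; }

int main() {
  float m = mask_lane(3.f < 5.f);
  uint32_t raw;
  std::memcpy(&raw, &m, sizeof(raw));
  std::printf("mask lane bits = 0x%08X, bool lane = %.1f\n", raw, bool_lane(3.f < 5.f));
}
```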
+ Vectorized vector; + for (int i = 0; i != size(); ++i) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) vector[i] = static_cast(op(values[i], other.values[i])); } return vector; } +<<<<<<< HEAD public: Vectorized eq(const Vectorized& other) const { return binary_pred_bool(other, std::equal_to()); } Vectorized ne(const Vectorized& other) const { return binary_pred_bool(other, std::not_equal_to()); } @@ -661,6 +1045,55 @@ struct Vectorized { }; template Vectorized inline operator+(const Vectorized &a, const Vectorized &b) { +======= + public: + Vectorized eq(const Vectorized& other) const { + return binary_pred_bool(other, std::equal_to()); + } + Vectorized ne(const Vectorized& other) const { + return binary_pred_bool(other, std::not_equal_to()); + } + Vectorized gt(const Vectorized& other) const { + return binary_pred_bool(other, std::greater()); + } + Vectorized ge(const Vectorized& other) const { + return binary_pred_bool(other, std::greater_equal()); + } + Vectorized lt(const Vectorized& other) const { + return binary_pred_bool(other, std::less()); + } + Vectorized le(const Vectorized& other) const { + return binary_pred_bool(other, std::less_equal()); + } +}; + +template +Vectorized inline operator-(const Vectorized& a) { + return a.neg(); +} + +// There is an implicit conversion that would make this work if +// these operators weren't template functions, but they are template +// functions (and can't be moved to be non-member friends defined in +// the class body as suggested in +// https://stackoverflow.com/questions/9787593/implicit-type-conversion-with-template/9788255#9788255 +// because we have a lot of disparate specializations of +// Vectorized). So, just explicitly make scalars work. 
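Editor's note: the comment ending here motivates the `VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC` / `..._OP` macros defined just below. Because the binary operators are function templates, a bare scalar argument never triggers the implicit broadcasting `Vectorized<T>(T)` conversion during template argument deduction, so each op also needs explicit `(Vectorized<T>, T)` and `(T, Vectorized<T>)` overloads. Below is a reduced reproduction with a toy `Vec` type standing in for `Vectorized<T>`:

```cpp
#include <cstdio>

template <typename T>
struct Vec {
  T v;
  Vec(T x) : v(x) {}  // implicit broadcast from a scalar
};

template <typename T>
Vec<T> operator+(const Vec<T>& a, const Vec<T>& b) { return Vec<T>(a.v + b.v); }

// Without these two, `a + 2.0f` fails to compile: deduction needs Vec<T> on
// both sides and never considers the Vec(T) conversion. The macro generates
// exactly this pair for every binary operator / named binary function.
template <typename T>
Vec<T> operator+(const Vec<T>& a, T b) { return a + Vec<T>(b); }
template <typename T>
Vec<T> operator+(T a, const Vec<T>& b) { return Vec<T>(a) + b; }

int main() {
  Vec<float> a(1.5f);
  Vec<float> c = a + 2.0f;  // works only because of the scalar overloads
  Vec<float> d = 2.0f + a;
  std::printf("%f %f\n", c.v, d.v);
}
```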
+#define VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(name) \ + template \ + Vectorized inline name(const Vectorized& a, T b) { \ + return name(a, Vectorized(b)); \ + } \ + template \ + Vectorized inline name(T a, const Vectorized& b) { \ + return name(Vectorized(a), b); \ + } +#define VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(op) \ + VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(operator op) + +template +Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { c[i] = a[i] + b[i]; @@ -668,7 +1101,14 @@ template Vectorized inline operator+(const Vectorized &a, const return c; } +<<<<<<< HEAD template Vectorized inline operator-(const Vectorized &a, const Vectorized &b) { +======= +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(+) + +template +Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { c[i] = a[i] - b[i]; @@ -676,7 +1116,14 @@ template Vectorized inline operator-(const Vectorized &a, const return c; } +<<<<<<< HEAD template Vectorized inline operator*(const Vectorized &a, const Vectorized &b) { +======= +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(-) + +template +Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { c[i] = a[i] * b[i]; @@ -684,7 +1131,15 @@ template Vectorized inline operator*(const Vectorized &a, const return c; } +<<<<<<< HEAD template Vectorized inline operator/(const Vectorized &a, const Vectorized &b) __ubsan_ignore_float_divide_by_zero__ { +======= +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(*) + +template +Vectorized inline operator/(const Vectorized& a, const Vectorized& b) + __ubsan_ignore_float_divide_by_zero__ { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { c[i] = a[i] / b[i]; @@ -692,6 +1147,7 @@ template Vectorized inline operator/(const Vectorized &a, const return c; } +<<<<<<< HEAD template , int> = 0> Vectorized inline operator%(const Vectorized &a, const Vectorized &b) __ubsan_ignore_float_divide_by_zero__ { @@ -700,6 +1156,22 @@ Vectorized inline operator%(const Vectorized &a, const Vectorized &b) _ template Vectorized inline operator||( const Vectorized &a, const Vectorized &b) { +======= +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(/) + +template , int> = 0> +Vectorized inline operator%(const Vectorized& a, const Vectorized& b) + __ubsan_ignore_float_divide_by_zero__ { + return a - a / b * b; +} + +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(%) + +template +Vectorized inline operator||( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { c[i] = a[i] || b[i]; @@ -707,11 +1179,22 @@ template Vectorized inline operator||( return c; } +<<<<<<< HEAD // Implements the IEEE 754 201X `maximum` operation, which propagates NaN if // either input is 
a NaN. template ::value, int> = 0> Vectorized inline maximum(const Vectorized &a, const Vectorized &b) { +======= +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(||) + +// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if +// either input is a NaN. +template < + class T, + typename std::enable_if_t::value, int> = 0> +Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { c[i] = (a[i] > b[i]) ? a[i] : b[i]; @@ -725,9 +1208,16 @@ Vectorized inline maximum(const Vectorized &a, const Vectorized &b) { return c; } +<<<<<<< HEAD template ::value, int> = 0> Vectorized inline maximum(const Vectorized &a, const Vectorized &b) { +======= +template < + class T, + typename std::enable_if_t::value, int> = 0> +Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { c[i] = (std::abs(a[i]) > std::abs(b[i])) ? a[i] : b[i]; @@ -741,11 +1231,22 @@ Vectorized inline maximum(const Vectorized &a, const Vectorized &b) { return c; } +<<<<<<< HEAD // Implements the IEEE 754 201X `minimum` operation, which propagates NaN if // either input is a NaN. template ::value, int> = 0> Vectorized inline minimum(const Vectorized &a, const Vectorized &b) { +======= +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(maximum) + +// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if +// either input is a NaN. +template < + class T, + typename std::enable_if_t::value, int> = 0> +Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { c[i] = (a[i] < b[i]) ? a[i] : b[i]; @@ -759,9 +1260,16 @@ Vectorized inline minimum(const Vectorized &a, const Vectorized &b) { return c; } +<<<<<<< HEAD template ::value, int> = 0> Vectorized inline minimum(const Vectorized &a, const Vectorized &b) { +======= +template < + class T, + typename std::enable_if_t::value, int> = 0> +Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { c[i] = (std::abs(a[i]) < std::abs(b[i])) ? 
a[i] : b[i]; @@ -775,9 +1283,21 @@ Vectorized inline minimum(const Vectorized &a, const Vectorized &b) { return c; } +<<<<<<< HEAD template ::value, int> = 0> Vectorized inline clamp(const Vectorized &a, const Vectorized &min_vec, const Vectorized &max_vec) { +======= +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(minimum) + +template < + class T, + typename std::enable_if_t::value, int> = 0> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_vec, + const Vectorized& max_vec) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { c[i] = std::min(std::max(a[i], min_vec[i]), max_vec[i]); @@ -785,9 +1305,54 @@ Vectorized inline clamp(const Vectorized &a, const Vectorized &min_vec, return c; } +<<<<<<< HEAD template ::value, int> = 0> Vectorized inline clamp_max(const Vectorized &a, const Vectorized &max_vec) { +======= +#define VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(name) \ + template \ + Vectorized inline name( \ + const Vectorized& a, const Vectorized& b, T c) { \ + return name(a, b, Vectorized(c)); \ + } \ + \ + template \ + Vectorized inline name( \ + const Vectorized& a, T b, const Vectorized& c) { \ + return name(a, Vectorized(b), c); \ + } \ + \ + template \ + Vectorized inline name(const Vectorized& a, T b, T c) { \ + return name(a, Vectorized(b), Vectorized(c)); \ + } \ + \ + template \ + Vectorized inline name( \ + T a, const Vectorized& b, const Vectorized& c) { \ + return name(Vectorized(a), b, c); \ + } \ + \ + template \ + Vectorized inline name(T a, const Vectorized& b, T c) { \ + return name(Vectorized(a), b, Vectorized(c)); \ + } \ + \ + template \ + Vectorized inline name(T a, T b, const Vectorized& c) { \ + return name(Vectorized(a), Vectorized(b), c); \ + } + +VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(clamp) + +template < + class T, + typename std::enable_if_t::value, int> = 0> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_vec) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { c[i] = a[i] > max_vec[i] ? max_vec[i] : a[i]; @@ -795,9 +1360,20 @@ Vectorized inline clamp_max(const Vectorized &a, const Vectorized &max_ return c; } +<<<<<<< HEAD template ::value, int> = 0> Vectorized inline clamp_min(const Vectorized &a, const Vectorized &min_vec) { +======= +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(clamp_max) + +template < + class T, + typename std::enable_if_t::value, int> = 0> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_vec) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { c[i] = a[i] < min_vec[i] ? 
min_vec[i] : a[i]; @@ -805,10 +1381,16 @@ Vectorized inline clamp_min(const Vectorized &a, const Vectorized &min_ return c; } +<<<<<<< HEAD +======= +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(clamp_min) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) struct Vectorizedi; #if defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512) template +<<<<<<< HEAD static inline Vectorized bitwise_binary_op(const Vectorized &a, const Vectorized &b, Op op) { int_vector buffer; #if defined(CPU_CAPABILITY_AVX2) @@ -817,6 +1399,23 @@ static inline Vectorized bitwise_binary_op(const Vectorized &a, const Vect #elif defined(CPU_CAPABILITY_AVX512) int_vector a_buffer = _mm512_load_si512(reinterpret_cast((const T*)a)); int_vector b_buffer = _mm512_load_si512(reinterpret_cast((const T*)b)); +======= +static inline Vectorized bitwise_binary_op( + const Vectorized& a, + const Vectorized& b, + Op op) { + int_vector buffer; +#if defined(CPU_CAPABILITY_AVX2) + int_vector a_buffer = + _mm256_load_si256(reinterpret_cast((const T*)a)); + int_vector b_buffer = + _mm256_load_si256(reinterpret_cast((const T*)b)); +#elif defined(CPU_CAPABILITY_AVX512) + int_vector a_buffer = + _mm512_load_si512(reinterpret_cast((const T*)a)); + int_vector b_buffer = + _mm512_load_si512(reinterpret_cast((const T*)b)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif buffer = op(a_buffer, b_buffer); __at_align__ T results[Vectorized::size()]; @@ -829,6 +1428,7 @@ static inline Vectorized bitwise_binary_op(const Vectorized &a, const Vect return Vectorized::loadu(results); } +<<<<<<< HEAD template>::value, int> = 0> inline Vectorized operator&(const Vectorized& a, const Vectorized& b) { // We enclose _mm512_and_si512 or _mm256_and_si256 with lambda because it is always_inline @@ -854,6 +1454,54 @@ inline Vectorized operator^(const Vectorized& a, const Vectorized& b) { return bitwise_binary_op(a, b, [](int_vector a, int_vector b) { return _mm256_xor_si256(a, b); }); #elif defined(CPU_CAPABILITY_AVX512) return bitwise_binary_op(a, b, [](int_vector a, int_vector b) { return _mm512_xor_si512(a, b); }); +======= +template < + class T, + typename std::enable_if_t< + !std::is_base_of>::value, + int> = 0> +inline Vectorized operator&(const Vectorized& a, const Vectorized& b) { + // We enclose _mm512_and_si512 or _mm256_and_si256 with lambda because it is + // always_inline +#if defined(CPU_CAPABILITY_AVX2) + return bitwise_binary_op( + a, b, [](int_vector a, int_vector b) { return _mm256_and_si256(a, b); }); +#elif defined(CPU_CAPABILITY_AVX512) + return bitwise_binary_op( + a, b, [](int_vector a, int_vector b) { return _mm512_and_si512(a, b); }); +#endif +} +template < + class T, + typename std::enable_if_t< + !std::is_base_of>::value, + int> = 0> +inline Vectorized operator|(const Vectorized& a, const Vectorized& b) { + // We enclose _mm512_or_si512 or _mm256_or_si256 with lambda because it is + // always_inline +#if defined(CPU_CAPABILITY_AVX2) + return bitwise_binary_op( + a, b, [](int_vector a, int_vector b) { return _mm256_or_si256(a, b); }); +#elif defined(CPU_CAPABILITY_AVX512) + return bitwise_binary_op( + a, b, [](int_vector a, int_vector b) { return _mm512_or_si512(a, b); }); +#endif +} +template < + class T, + typename std::enable_if_t< + !std::is_base_of>::value, + int> = 0> +inline Vectorized operator^(const Vectorized& a, const Vectorized& b) { + // We enclose 
_mm512_xor_si512 or _mm256_xor_si256 with lambda because it is + // always_inline +#if defined(CPU_CAPABILITY_AVX2) + return bitwise_binary_op( + a, b, [](int_vector a, int_vector b) { return _mm256_xor_si256(a, b); }); +#elif defined(CPU_CAPABILITY_AVX512) + return bitwise_binary_op( + a, b, [](int_vector a, int_vector b) { return _mm512_xor_si512(a, b); }); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif } @@ -866,12 +1514,28 @@ auto load(char const* data) -> T { return ret; } +<<<<<<< HEAD template static inline Vectorized bitwise_binary_op(const Vectorized &a, const Vectorized &b, Op op) { static constexpr uint32_t element_no = VECTOR_WIDTH / sizeof(intmax_t); __at_align__ intmax_t buffer[element_no]; static_assert(VECTOR_WIDTH % sizeof(intmax_t) == 0, "VECTOR_WIDTH not a multiple of sizeof(intmax_t)"); static_assert(sizeof(buffer) == sizeof(Vectorized), "sizeof(buffer) must match sizeof(Vectorized)"); +======= +template +static inline Vectorized bitwise_binary_op( + const Vectorized& a, + const Vectorized& b, + Op op) { + static constexpr uint32_t element_no = VECTOR_WIDTH / sizeof(intmax_t); + __at_align__ intmax_t buffer[element_no]; + static_assert( + VECTOR_WIDTH % sizeof(intmax_t) == 0, + "VECTOR_WIDTH not a multiple of sizeof(intmax_t)"); + static_assert( + sizeof(buffer) == sizeof(Vectorized), + "sizeof(buffer) must match sizeof(Vectorized)"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // We should be using memcpy in order to respect the strict aliasing rule // see: https://github.com/pytorch/pytorch/issues/66119 // Using char* is defined in the C11 standard 6.5 Expression paragraph 7 @@ -889,6 +1553,7 @@ static inline Vectorized bitwise_binary_op(const Vectorized &a, const Vect return Vectorized::loadu(buffer); } +<<<<<<< HEAD template>, int> = 0> inline Vectorized operator&(const Vectorized& a, const Vectorized& b) { return bitwise_binary_op(a, b, std::bit_and()); @@ -898,12 +1563,33 @@ inline Vectorized operator|(const Vectorized& a, const Vectorized& b) { return bitwise_binary_op(a, b, std::bit_or()); } template>, int> = 0> +======= +template < + class T, + typename std:: + enable_if_t>, int> = 0> +inline Vectorized operator&(const Vectorized& a, const Vectorized& b) { + return bitwise_binary_op(a, b, std::bit_and()); +} +template < + class T, + typename std:: + enable_if_t>, int> = 0> +inline Vectorized operator|(const Vectorized& a, const Vectorized& b) { + return bitwise_binary_op(a, b, std::bit_or()); +} +template < + class T, + typename std:: + enable_if_t>, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized operator^(const Vectorized& a, const Vectorized& b) { return bitwise_binary_op(a, b, std::bit_xor()); } #endif // defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512) +<<<<<<< HEAD template>, int> = 0> inline Vectorized operator~(const Vectorized& a) { using int_t = int_same_size_t; @@ -912,11 +1598,36 @@ inline Vectorized operator~(const Vectorized& a) { } template Vectorized inline operator<<(const Vectorized &a, const Vectorized &b) { +======= +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(&) +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(|) +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(^) + +template < + class T, + typename std:: + enable_if_t>, int> = 0> +inline Vectorized operator~(const 
Vectorized& a) { + using int_t = int_same_size_t; + Vectorized ones(c10::bit_cast((int_t)(~(int_t)0))); // All bits are 1 + return a ^ ones; +} + +template +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) constexpr T max_shift = sizeof(T) * CHAR_BIT; Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { T shift = b[i]; +<<<<<<< HEAD if ((static_cast>(shift) < 0) || (shift >= max_shift)) { +======= + if ((static_cast>(shift) < 0) || + (shift >= max_shift)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c[i] = 0; } else { c[i] = static_cast>(a[i]) << shift; @@ -925,13 +1636,25 @@ template Vectorized inline operator<<(const Vectorized &a, const return c; } +<<<<<<< HEAD template Vectorized inline operator>>(const Vectorized &a, const Vectorized &b) { +======= +template +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // right shift value to retain sign bit for signed and no bits for unsigned constexpr T max_shift = sizeof(T) * CHAR_BIT - std::is_signed_v; Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { T shift = b[i]; +<<<<<<< HEAD if ((static_cast>(shift) < 0) || (shift >= max_shift)) { +======= + if ((static_cast>(shift) < 0) || + (shift >= max_shift)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c[i] = a[i] >> max_shift; } else { c[i] = a[i] >> shift; @@ -941,44 +1664,73 @@ template Vectorized inline operator>>(const Vectorized &a, const } template +<<<<<<< HEAD inline Vectorized& operator += (Vectorized& a, const Vectorized& b) { +======= +inline Vectorized& operator+=(Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) a = a + b; return a; } template +<<<<<<< HEAD inline Vectorized& operator -= (Vectorized& a, const Vectorized& b) { +======= +inline Vectorized& operator-=(Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) a = a - b; return a; } template +<<<<<<< HEAD inline Vectorized& operator /= (Vectorized& a, const Vectorized& b) { +======= +inline Vectorized& operator/=(Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) a = a / b; return a; } template +<<<<<<< HEAD inline Vectorized& operator %= (Vectorized& a, const Vectorized& b) { +======= +inline Vectorized& operator%=(Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) a = a % b; return a; } template +<<<<<<< HEAD inline Vectorized& operator *= (Vectorized& a, const Vectorized& b) { +======= +inline Vectorized& operator*=(Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) a = a * b; return a; } template +<<<<<<< HEAD inline Vectorized& operator <<= (Vectorized& a, const 
Vectorized& b) { +======= +inline Vectorized& operator<<=(Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) a = a << b; return a; } template +<<<<<<< HEAD inline Vectorized& operator >>= (Vectorized& a, const Vectorized& b) { +======= +inline Vectorized& operator>>=(Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) a = a >> b; return a; } template +<<<<<<< HEAD inline Vectorized fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { return a * b + c; } @@ -988,6 +1740,27 @@ inline Vectorized fmsub(const Vectorized& a, const Vectorized& b, const return a * b - c; } +======= +inline Vectorized fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return a * b + c; +} + +VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(fmadd) + +template +inline Vectorized fmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return a * b - c; +} + +VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(fmsub) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template Vectorized inline operator&&( const Vectorized& a, @@ -999,9 +1772,19 @@ Vectorized inline operator&&( return ret; } +<<<<<<< HEAD template std::enable_if_t> inline gather(T const* base_addr, const Vectorized>& vindex) { +======= +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(&&) + +template +std::enable_if_t< + scale == 1 || scale == 2 || scale == 4 || scale == 8, + Vectorized< + T>> inline gather(T const* base_addr, const Vectorized>& vindex) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static constexpr int size = Vectorized::size(); int_same_size_t index_arr[size]; vindex.store(static_cast(index_arr)); @@ -1013,36 +1796,65 @@ inline gather(T const* base_addr, const Vectorized>& vindex) } template +<<<<<<< HEAD std::enable_if_t> inline mask_gather(const Vectorized& src, T const* base_addr, const Vectorized>& vindex, Vectorized& mask) { static constexpr int size = Vectorized::size(); T src_arr[size]; int_same_size_t mask_arr[size]; // use int type so we can logical and +======= +std:: + enable_if_t> inline mask_gather( + const Vectorized& src, + T const* base_addr, + const Vectorized>& vindex, + Vectorized& mask) { + static constexpr int size = Vectorized::size(); + T src_arr[size]; + int_same_size_t mask_arr[size]; // use int type so we can logical and +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int_same_size_t index_arr[size]; src.store(static_cast(src_arr)); mask.store(static_cast(mask_arr)); vindex.store(static_cast(index_arr)); T buffer[size]; for (const auto i : c10::irange(size)) { +<<<<<<< HEAD if (mask_arr[i] & 0x01) { // check highest bit +======= + if (mask_arr[i] & 0x01) { // check highest bit +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)]; } else { buffer[i] = src_arr[i]; } } +<<<<<<< HEAD mask = Vectorized(static_cast(0)); // "zero out" mask +======= + mask = Vectorized(static_cast(0)); // "zero out" mask +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for 
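// Illustrative sketch (not part of the patch): the per-lane arithmetic that the
// scalar gather fallback above performs -- an element load at a byte offset of
// index * scale from base_addr. Plain C++ only; the helper and demo names are
// made up for illustration.
#include <cassert>
#include <cstdint>

template <int scale, typename T>
T scalar_gather_one_lane(const T* base_addr, std::int64_t index) {
  static_assert(scale == 1 || scale == 2 || scale == 4 || scale == 8,
                "scale must be 1, 2, 4 or 8");
  // Mirrors: buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)];
  return base_addr[index * scale / static_cast<std::int64_t>(sizeof(T))];
}

inline void scalar_gather_demo() {
  float src[4] = {10.f, 11.f, 12.f, 13.f};
  // With scale == sizeof(float), indices behave as plain element indices.
  assert(scalar_gather_one_lane<4>(src, 2) == 12.f);
}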
mixed dtypes with float/bfloat16/half (#2791)) return Vectorized::loadu(static_cast(buffer)); } // Cast a given vector to another type without changing the bits representation. // So a Vectorized of 512 bits containing all ones can be cast to a +<<<<<<< HEAD // Vectorized of 512 bits containing all ones (i.e., eight negative 1s). // A Vec of 256 bits containing all ones can be cast to a // Vec of 256 bits containing all ones (i.e., four negative 1s). // There is a struct here because we don't have static_if and I can't // partially specialize a templated function. template +======= +// Vectorized of 512 bits containing all ones (i.e., eight negative +// 1s). A Vec of 256 bits containing all ones can be cast to a +// Vec of 256 bits containing all ones (i.e., four negative 1s). +// There is a struct here because we don't have static_if and I can't +// partially specialize a templated function. +template +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) struct CastImpl { static inline Vectorized apply(const Vectorized& src) { src_t src_arr[Vectorized::size()]; @@ -1051,19 +1863,28 @@ struct CastImpl { } }; +<<<<<<< HEAD template +======= +template +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) struct CastImpl { static inline Vectorized apply(const Vectorized& src) { return src; } }; +<<<<<<< HEAD template +======= +template +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized cast(const Vectorized& src) { return CastImpl::apply(src); } template > +<<<<<<< HEAD inline Vectorized convert_to_int_of_same_size(const Vectorized& src) { static_assert(sizeof(T) == sizeof(IntType)); static constexpr int size = Vectorized::size(); @@ -1073,22 +1894,52 @@ inline Vectorized convert_to_int_of_same_size(const Vectorized& src) std::array buffer; std::transform(src_arr.cbegin(), src_arr.cend(), buffer.begin(), [](const T& x) { return static_cast(x); }); +======= +inline Vectorized convert_to_int_of_same_size( + const Vectorized& src) { + static_assert(sizeof(T) == sizeof(IntType)); + static constexpr int size = Vectorized::size(); + + std::array src_arr = {}; + src.store(static_cast(src_arr.data())); + std::array buffer; + std::transform( + src_arr.cbegin(), src_arr.cend(), buffer.begin(), [](const T& x) { + return static_cast(x); + }); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized::loadu(static_cast(buffer.data())); } template > +<<<<<<< HEAD inline Vectorized convert_to_fp_of_same_size(const Vectorized& src) { +======= +inline Vectorized convert_to_fp_of_same_size( + const Vectorized& src) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static_assert(sizeof(T) == sizeof(IntType)); static constexpr int size = Vectorized::size(); std::array src_arr; src.store(static_cast(src_arr.data())); std::array buffer; +<<<<<<< HEAD std::transform(src_arr.cbegin(), src_arr.cend(), buffer.begin(), [](const IntType& x) { return static_cast(x); }); return Vectorized::loadu(static_cast(buffer.data())); } +======= + std::transform( + src_arr.cbegin(), src_arr.cend(), buffer.begin(), [](const IntType& x) { + return static_cast(x); + }); + return Vectorized::loadu(static_cast(buffer.data())); +} + +// 
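// Illustrative sketch (not part of the patch): the scalar analogue of the two
// conversions above -- cast<> keeps the bit pattern unchanged, while
// convert_to_int_of_same_size() converts the numeric value. Assumes a C++20
// compiler for std::bit_cast (the surrounding code uses c10::bit_cast instead).
#include <bit>
#include <cassert>
#include <cstdint>

inline void bitcast_vs_convert_demo() {
  float x = -1.5f;
  // Bit-preserving reinterpretation: the same 32 bits viewed as an integer.
  std::uint32_t bits = std::bit_cast<std::uint32_t>(x);
  // Value conversion: rounds toward zero like the static_cast in the loop above.
  std::int32_t value = static_cast<std::int32_t>(x);
  assert(bits == 0xBFC00000u);
  assert(value == -1);
}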
clang-format off +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Example inputs for AVX512: // a Vectorized = {a0, b0, a1, b1, a2, b2, a3, b3, a4, b4, a5, b5, a6, b6, a7, b7} // b Vectorized = {a8, b8, a9, b9, a10, b10, a11, b11, a12, b12, a13, b13, a14, b14, a15, b15} @@ -1099,8 +1950,16 @@ inline Vectorized convert_to_fp_of_same_size(const Vectorized& src) // b Vectorized = {a4, b4, a5, b5, a6, b6, a7, b7} // returns: Vectorized = {a0, a1, a2, a3, a4, a5, a6, a7} // Vectorized = {b0, b1, b2, b3, b4, b5, b6, b7} +<<<<<<< HEAD template inline std::enable_if_t::size() % 2 == 0, std::pair, Vectorized>> +======= +// clang-format on +template +inline std::enable_if_t< + Vectorized::size() % 2 == 0, + std::pair, Vectorized>> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) deinterleave2(const Vectorized& a, const Vectorized& b) { static constexpr int size = Vectorized::size(); static constexpr int half_size = size / 2; @@ -1116,10 +1975,21 @@ deinterleave2(const Vectorized& a, const Vectorized& b) { buffer2[i] = a_arr[i * 2 + 1]; buffer2[half_size + i] = b_arr[i * 2 + 1]; } +<<<<<<< HEAD return std::make_pair(Vectorized::loadu(static_cast(buffer1)), Vectorized::loadu(static_cast(buffer2))); } +======= + return std::make_pair( + Vectorized::loadu(static_cast(buffer1)), + Vectorized::loadu(static_cast(buffer2))); +} + +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(deinterleave2) + +// clang-format off +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // inverse operation of deinterleave2 // Example inputs for AVX512: // a Vectorized = {a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15} @@ -1131,8 +2001,16 @@ deinterleave2(const Vectorized& a, const Vectorized& b) { // b Vectorized = {b0, b1, b2, b3, b4, b5, b6, b7} // returns: Vectorized = {a0, b0, a1, b1, a2, b2, a3, b3} // Vectorized = {a4, b4, a5, b5, a6, b6, a7, b7} +<<<<<<< HEAD template inline std::enable_if_t::size() % 2 == 0, std::pair, Vectorized>> +======= +// clang-format on +template +inline std::enable_if_t< + Vectorized::size() % 2 == 0, + std::pair, Vectorized>> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) interleave2(const Vectorized& a, const Vectorized& b) { static constexpr int size = Vectorized::size(); static constexpr int half_size = size / 2; @@ -1148,6 +2026,7 @@ interleave2(const Vectorized& a, const Vectorized& b) { buffer2[i * 2] = a_arr[half_size + i]; buffer2[i * 2 + 1] = b_arr[half_size + i]; } +<<<<<<< HEAD return std::make_pair(Vectorized::loadu(static_cast(buffer1)), Vectorized::loadu(static_cast(buffer2))); } @@ -1156,6 +2035,23 @@ template inline void convert(const src_T *src, dst_T *dst, int64_t n) { #ifndef _MSC_VER # pragma unroll +======= + return std::make_pair( + Vectorized::loadu(static_cast(buffer1)), + Vectorized::loadu(static_cast(buffer2))); +} + +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(interleave2) + +#undef VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC +#undef VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP +#undef VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC + +template +inline void convert(const src_T* src, dst_T* dst, int64_t n) { +#ifndef _MSC_VER +#pragma unroll +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
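// Illustrative sketch (not part of the patch): a scalar reference for the
// deinterleave2 shuffle documented above, using std::array in place of
// Vectorized<T>; function and variable names are made up for illustration.
#include <array>
#include <cassert>
#include <cstddef>
#include <utility>

template <typename T, std::size_t N>
std::pair<std::array<T, N>, std::array<T, N>> scalar_deinterleave2(
    const std::array<T, N>& a, const std::array<T, N>& b) {
  static_assert(N % 2 == 0, "lane count must be even");
  constexpr std::size_t half = N / 2;
  std::array<T, N> out1{}, out2{};
  for (std::size_t i = 0; i < half; ++i) {
    out1[i] = a[2 * i];            // even lanes of a
    out1[half + i] = b[2 * i];     // even lanes of b
    out2[i] = a[2 * i + 1];        // odd lanes of a
    out2[half + i] = b[2 * i + 1]; // odd lanes of b
  }
  return {out1, out2};
}

inline void scalar_deinterleave2_demo() {
  std::array<int, 4> a{0, 100, 1, 101}; // {a0, b0, a1, b1}
  std::array<int, 4> b{2, 102, 3, 103}; // {a2, b2, a3, b3}
  auto [evens, odds] = scalar_deinterleave2(a, b);
  assert((evens == std::array<int, 4>{0, 1, 2, 3}));       // {a0, a1, a2, a3}
  assert((odds == std::array<int, 4>{100, 101, 102, 103})); // {b0, b1, b2, b3}
}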
float/bfloat16/half (#2791)) #endif for ([[maybe_unused]] const auto i : c10::irange(n)) { *dst = c10::convert(c10::load(src)); @@ -1165,7 +2061,11 @@ inline void convert(const src_T *src, dst_T *dst, int64_t n) { } template +<<<<<<< HEAD inline Vectorized flip(const Vectorized & data) { +======= +inline Vectorized flip(const Vectorized& data) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static constexpr int size = Vectorized::size(); T output[size]; T buffer[size]; @@ -1176,6 +2076,7 @@ inline Vectorized flip(const Vectorized & data) { return Vectorized::loadu(static_cast(output)); } +<<<<<<< HEAD // Transpose the `src` buffer of type `T` and size (M,N) into the `dst` buffer. `ld_src` is the leading // dimension of `src` and `ld_dst` is the leading dimension of `dst`. template @@ -1183,11 +2084,28 @@ inline void transpose_mxn(const T* src, int64_t ld_src, T* dst, int64_t ld_dst, for (int i = 0; i < M; i++) { for (int j = 0; j < N; j++) { dst[j*ld_dst + i] = src[i*ld_src + j]; +======= +// Transpose the `src` buffer of type `T` and size (M,N) into the `dst` buffer. +// `ld_src` is the leading dimension of `src` and `ld_dst` is the leading +// dimension of `dst`. +template +inline void transpose_mxn( + const T* src, + int64_t ld_src, + T* dst, + int64_t ld_dst, + int M, + int N) { + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + dst[j * ld_dst + i] = src[i * ld_src + j]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } } template +<<<<<<< HEAD inline void transpose_mxn(const T* src, int64_t ld_src, T* dst, int64_t ld_dst) { transpose_mxn(src, ld_src, dst, ld_dst, M, N); } @@ -1198,3 +2116,20 @@ inline void transpose_mxn(const T* src, int64_t ld_src, T* dst, int64_t ld_dst) #include #include #include +======= +inline void transpose_mxn( + const T* src, + int64_t ld_src, + T* dst, + int64_t ld_dst) { + transpose_mxn(src, ld_src, dst, ld_dst, M, N); +} + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +// additional headers for more operations that depend on vec_base +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec_convert.h b/aten/src/ATen/cpu/vec/vec_convert.h index a5cee03dabcf..98ad0cab93fb 100644 --- a/aten/src/ATen/cpu/vec/vec_convert.h +++ b/aten/src/ATen/cpu/vec/vec_convert.h @@ -28,8 +28,13 @@ struct VecConvert { }; template +<<<<<<< HEAD inline std::enable_if_t, Vectorized> convert(const Vectorized& src) { +======= +inline std::enable_if_t, Vectorized> convert( + const Vectorized& src) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return src; } @@ -62,4 +67,21 @@ convert(const VectorizedN& src) { } } // namespace CPU_CAPABILITY +<<<<<<< HEAD +======= + +template < + typename scalar_t, + typename std::enable_if_t, int> = 0> +inline std::tuple, Vectorized> convert_to_float( + const Vectorized&); + +template < + typename scalar_t, + typename std::enable_if_t, int> = 0> +inline Vectorized convert_from_float( + const Vectorized&, + const Vectorized&); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::vec diff --git a/aten/src/ATen/cpu/vec/vec_half.h 
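// Illustrative sketch (not part of the patch): the scalar idea behind the
// convert_to_float/convert_from_float declarations above for bfloat16 -- the
// 16 stored bits occupy the upper half of an IEEE-754 binary32 value. The
// narrowing shown here simply truncates; real implementations typically round
// to nearest-even. Assumes a C++20 compiler for std::bit_cast.
#include <bit>
#include <cstdint>

inline float bf16_bits_to_float(std::uint16_t bits) {
  return std::bit_cast<float>(static_cast<std::uint32_t>(bits) << 16);
}

inline std::uint16_t float_to_bf16_bits_truncate(float f) {
  return static_cast<std::uint16_t>(std::bit_cast<std::uint32_t>(f) >> 16);
}

inline void bf16_roundtrip_demo() {
  float x = 1.0f;                                   // exactly representable
  std::uint16_t b = float_to_bf16_bits_truncate(x); // 0x3F80
  float y = bf16_bits_to_float(b);                  // back to 1.0f
  (void)y;
}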
b/aten/src/ATen/cpu/vec/vec_half.h index c7c90cc95b47..67aedf6551f2 100644 --- a/aten/src/ATen/cpu/vec/vec_half.h +++ b/aten/src/ATen/cpu/vec/vec_half.h @@ -103,7 +103,13 @@ static inline void transpose_pad_2x32_block( _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 32), d1); } #else +<<<<<<< HEAD TORCH_CHECK(false, "transpose_pad_2x32_block is only supported when avx512 is supported") +======= + TORCH_CHECK( + false, + "transpose_pad_2x32_block is only supported when avx512 is supported") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif } @@ -124,16 +130,27 @@ static inline void pack_vnni2( for (; bk < _K; bk += 2) { int64_t bn = 0; for (; bn < _N; bn += 32) { +<<<<<<< HEAD transpose_pad_2x32_block(src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src); } int64_t nrem = N - bn; if (nrem > 0) { transpose_pad_2x32_block(src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src, 2, nrem); +======= + transpose_pad_2x32_block( + src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src); + } + int64_t nrem = N - bn; + if (nrem > 0) { + transpose_pad_2x32_block( + src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src, 2, nrem); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } if (K % 2 == 1) { int64_t bn = 0; for (; bn < _N; bn += 32) { +<<<<<<< HEAD transpose_pad_2x32_block(src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src, 1); } int64_t nrem = N - bn; @@ -147,5 +164,21 @@ TORCH_CHECK(false, "pack_vnni2 is only supported when avx512 is supported") } +======= + transpose_pad_2x32_block( + src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src, 1); + } + int64_t nrem = N - bn; + if (nrem > 0) { + transpose_pad_2x32_block( + src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src, 1, nrem); + } + } +#else + TORCH_CHECK(false, "pack_vnni2 is only supported when avx512 is supported") +#endif +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace CPU_CAPABILITY } // namespace at::vec diff --git a/aten/src/ATen/cpu/vec/vec_mask.h b/aten/src/ATen/cpu/vec/vec_mask.h index c547e5911ecb..b1a0cf0441c0 100644 --- a/aten/src/ATen/cpu/vec/vec_mask.h +++ b/aten/src/ATen/cpu/vec/vec_mask.h @@ -68,7 +68,16 @@ struct VecMaskTo { } }; +<<<<<<< HEAD template +======= +template < + typename dst_t, + int dst_n, + typename src_t, + int src_n, + typename Enabled = void> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) struct VecMaskCast { static inline VecMask apply( const VecMask& vec_mask) { @@ -88,15 +97,27 @@ struct VecMaskCheck { static inline bool all_zero(const VectorizedN& vec_mask) { __at_align__ T mask[VectorizedN::size()]; vec_mask.store(mask); +<<<<<<< HEAD return std::all_of( mask, mask + VectorizedN::size(), [](T m) { return m == static_cast(0); }); +======= + return std::all_of(mask, mask + VectorizedN::size(), [](T m) { + return m == static_cast(0); + }); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } static inline bool all_masked(const VectorizedN& vec_mask) { __at_align__ T mask[VectorizedN::size()]; vec_mask.store(mask); +<<<<<<< HEAD return std::all_of( mask, mask + VectorizedN::size(), [](T m) { return m != static_cast(0); }); +======= + return std::all_of(mask, mask + 
VectorizedN::size(), [](T m) { + return m != static_cast(0); + }); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } static inline bool is_masked(const VectorizedN& vec_mask, int i) { @@ -159,6 +180,7 @@ class VecMask { } static VecMask blendv( +<<<<<<< HEAD const VecMask& c, const VecMask& b, const VecMask& a) { @@ -166,6 +188,13 @@ class VecMask { VectorizedN(c), VectorizedN(b), VectorizedN(a)); +======= + const VecMask& c, + const VecMask& b, + const VecMask& a) { + VectorizedN result = VectorizedN::blendv( + VectorizedN(c), VectorizedN(b), VectorizedN(a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return result; } @@ -174,14 +203,24 @@ class VecMask { const VecMask& b, int64_t count = size()) { VectorizedN result = VectorizedN::set( +<<<<<<< HEAD VectorizedN(a), VectorizedN(b), count); +======= + VectorizedN(a), VectorizedN(b), count); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return result; } void store(bool* b, int count = size()) { +<<<<<<< HEAD constexpr int L = (VectorizedN::size() + Vectorized::size() - 1)/ Vectorized::size(); +======= + constexpr int L = + (VectorizedN::size() + Vectorized::size() - 1) / + Vectorized::size(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto res = this->to(); res.store(b, count); return; diff --git a/aten/src/ATen/cuda/Atomic.cuh b/aten/src/ATen/cuda/Atomic.cuh index 5a127b4d7507..a9868dd1937b 100644 --- a/aten/src/ATen/cuda/Atomic.cuh +++ b/aten/src/ATen/cuda/Atomic.cuh @@ -330,9 +330,24 @@ inline __device__ void gpuAtomicAddNoReturn(int64_t *address, int64_t val) { gpu inline __device__ void gpuAtomicAddNoReturn(bool *address, bool val) { gpuAtomicAdd(address, val); } inline __device__ void gpuAtomicAddNoReturn(at::Half *address, at::Half val) { gpuAtomicAdd(address, val); } inline __device__ void gpuAtomicAddNoReturn(at::BFloat16 *address, at::BFloat16 val) { gpuAtomicAdd(address, val); } +<<<<<<< HEAD inline __device__ void gpuAtomicAddNoReturn(double *address, double val) { gpuAtomicAdd(address, val); } /* Special case fp32 atomic. */ +======= + +/* Note [HIP unsafeAtomicAdd] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~ + * Use unsafeAtomicAdd instead of atomicAdd for fp32 and fp64. + * On HIP, atomicAdd is always correct but is a slow CAS loop. + * unsafeAtomicAdd will use HW instructions and is much faster, + * but the caller must guarantee the pointer is GPU memory. + * If the pointer is system memory, the result is a silent no-op. + * This guarantee is upheld by all PyTorch uses of unsafeAtomicAdd. + * AMD HIP atomic header file is named amd_hip_atomic.h and is + * under the LLVM compiler directory. 
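 * (Illustrative sketch, not part of the upstream note: the guarded pattern
 *  described above, using only the overloads the surrounding code already
 *  relies on; the function name is made up.)
 *
 *    __device__ void add_to_device_buffer(float* device_ptr, float val) {
 *    #if defined(USE_ROCM)
 *      // device_ptr must point to GPU memory; on system memory this is a
 *      // silent no-op, as the note explains.
 *      (void)unsafeAtomicAdd(device_ptr, val);
 *    #else
 *      atomicAdd(device_ptr, val);
 *    #endif
 *    }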
+ */ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if defined(USE_ROCM) inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { #if defined(__gfx908__) @@ -341,8 +356,15 @@ inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { (void)unsafeAtomicAdd(address, val); #endif } +<<<<<<< HEAD +#else +inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { gpuAtomicAdd(address, val); } +======= +inline __device__ void gpuAtomicAddNoReturn(double *address, double val) { (void)unsafeAtomicAdd(address, val); } #else inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { gpuAtomicAdd(address, val); } +inline __device__ void gpuAtomicAddNoReturn(double *address, double val) { gpuAtomicAdd(address, val); } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif // Atomic multiplication implementation. @@ -399,7 +421,11 @@ template __host__ __device__ T safe_max(T a, T b) { #if defined(__HIPCC__) // TODO: remove this special case for HIP when issue is fixed: +<<<<<<< HEAD // https://github.com/ROCm-Developer-Tools/HIP/issues/2209 +======= + // https://github.com/ROCm/hip/issues/2209 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) T max = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::max(a, b)); #else T max = at::_isnan(b) ? b : std::max(a, b); @@ -459,7 +485,11 @@ template __host__ __device__ T safe_min(T a, T b) { #if defined(__HIPCC__) // TODO: remove this special case for HIP when issue is fixed: +<<<<<<< HEAD // https://github.com/ROCm-Developer-Tools/HIP/issues/2209 +======= + // https://github.com/ROCm/hip/issues/2209 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) T min = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::min(a, b)); #else T min = at::_isnan(b) ? b : std::min(a, b); diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index abf45deffeb9..e5b2de4e76bd 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -17,7 +17,10 @@ #include #ifdef USE_ROCM +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include // until hipblas has an API to accept flags, we must use rocblas here #include @@ -112,12 +115,24 @@ static cublasOperation_t _cublasOpFromChar(char op) { // NOLINTNEXTLINE(bugprone-switch-missing-default-case) switch (op) { case 'n': +<<<<<<< HEAD case 'N': return CUBLAS_OP_N; case 't': case 'T': return CUBLAS_OP_T; case 'c': +======= + [[fallthrough]]; + case 'N': + return CUBLAS_OP_N; + case 't': + [[fallthrough]]; + case 'T': + return CUBLAS_OP_T; + case 'c': + [[fallthrough]]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case 'C': return CUBLAS_OP_C; } @@ -186,6 +201,7 @@ uint32_t _getAlignment(uintptr_t address) { } #endif +<<<<<<< HEAD #ifdef USE_ROCM static c10::cuda::CUDAStream _getCarveoutStream(int32_t value) { // 0 is default value, meaning full CUs i.e. 
no mask @@ -244,6 +260,8 @@ static void _syncCurrentWithCarveoutStream(hipStream_t stream, bool presync) { } #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) struct CublasLtWorkspace { CublasLtWorkspace() { size = at::cuda::getCUDABlasLtWorkspaceSize(); @@ -252,7 +270,10 @@ struct CublasLtWorkspace { void * ptr; size_t size; }; +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // anonymous namespace namespace at::cuda::blas { @@ -280,7 +301,10 @@ namespace at::cuda::blas { CUDABLAS_NONNEGINT_CHECK(bgemm, num_batches); \ } while (0) +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace { // Following the pattern of CuSparseDescriptor // Defined here for now because this is the only place cublas_lt interface is @@ -366,11 +390,30 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< } // namespace +<<<<<<< HEAD template inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { cudaDataType_t abcType = CUDA_R_32F; cublasComputeType_t computeType = CUBLAS_COMPUTE_32F; cudaDataType_t scaleType = CUDA_R_32F; +======= +template +static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) { +#if defined(USE_ROCM) && ROCM_VERSION == 60400 + // regression in ROCm 6.4, planned fixed in 6.4.1, hipblaslt TT fp32 calculation errors + // best to disallow hipblaslt for this specific case + if constexpr (std::is_same_v) { + if (_cublasOpFromChar(transa) == CUBLAS_OP_T && _cublasOpFromChar(transb) == CUBLAS_OP_T) { + return false; + } + } +#endif + cudaDataType_t abType = CUDA_R_32F; + cudaDataType_t cType = CUDA_R_32F; + cublasComputeType_t computeType = CUBLAS_COMPUTE_32F; + cudaDataType_t scaleType = CUDA_R_32F; + CuBlasLtMatmulPreference preference; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef USE_ROCM at::Half halpha; at::Half hbeta; @@ -378,7 +421,12 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { void * alpha_ptr = α void * beta_ptr = β if constexpr (std::is_same_v) { +<<<<<<< HEAD abcType = CUDA_R_64F; +======= + abType = CUDA_R_64F; + cType = CUDA_R_64F; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) computeType = CUBLAS_COMPUTE_64F; scaleType = CUDA_R_64F; } else if constexpr (std::is_same_v) { @@ -386,26 +434,60 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { computeType = CUBLAS_COMPUTE_32F_FAST_TF32; } } else if constexpr (std::is_same_v>) { +<<<<<<< HEAD abcType = CUDA_C_64F; computeType = CUBLAS_COMPUTE_64F; scaleType = CUDA_C_64F; } else if constexpr (std::is_same_v>) { abcType = CUDA_C_32F; +======= + abType = CUDA_C_64F; + cType = CUDA_C_64F; + computeType = CUBLAS_COMPUTE_64F; + scaleType = CUDA_C_64F; + } else if constexpr (std::is_same_v>) { + abType = CUDA_C_32F; + cType = CUDA_C_32F; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scaleType = CUDA_C_32F; } else if constexpr (std::is_same_v) { #ifndef USE_ROCM cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); if (prop->major >= 7 && 
at::globalContext().allowFP16AccumulationCuBLAS()) { computeType = CUBLAS_COMPUTE_16F; +<<<<<<< HEAD +======= + scaleType = CUDA_R_16F; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) halpha = alpha; hbeta = beta; alpha_ptr = &halpha; beta_ptr = &hbeta; } #endif +<<<<<<< HEAD abcType = CUDA_R_16F; } else if constexpr (std::is_same_v) { abcType = CUDA_R_16BF; +======= + abType = CUDA_R_16F; + cType = (std::is_same_v) ? CUDA_R_32F : CUDA_R_16F; +#ifndef USE_ROCM + if (!at::globalContext().allowFP16ReductionCuBLAS()) { + preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, + CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE); + } +#endif + } else if constexpr (std::is_same_v) { + abType = CUDA_R_16BF; + cType = (std::is_same_v) ? CUDA_R_32F : CUDA_R_16BF; +#ifndef USE_ROCM + if (!at::globalContext().allowBF16ReductionCuBLAS()) { + preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, + CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE); + } +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { static_assert(false && sizeof(Dtype), "at::cuda::blas::bgemm_internal_cublaslt: not implemented"); } @@ -419,7 +501,10 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, opa); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, opb); +<<<<<<< HEAD auto stream = at::cuda::getCurrentCUDAStream(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { computeDesc.setAttribute( @@ -427,6 +512,7 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { at::cuda::getCurrentDeviceProperties()->multiProcessorCount - at::globalContext()._SMCarveout_EXPERIMENTAL().value()); } +<<<<<<< HEAD #else if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { stream = _getCarveoutStream( @@ -437,6 +523,12 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { CuBlasLtMatrixLayout Adesc(abcType, m, k, lda, opa == CUBLAS_OP_T); CuBlasLtMatrixLayout Bdesc(abcType, k, n, ldb, opb == CUBLAS_OP_T); CuBlasLtMatrixLayout Cdesc(abcType, m, n, ldc); +======= +#endif + CuBlasLtMatrixLayout Adesc(abType, m, k, lda, opa == CUBLAS_OP_T); + CuBlasLtMatrixLayout Bdesc(abType, k, n, ldb, opb == CUBLAS_OP_T); + CuBlasLtMatrixLayout Cdesc(cType, m, n, ldc); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (num_batches > 1) { int num_batches_as_int = static_cast(num_batches); @@ -448,8 +540,11 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { Cdesc.setAttribute(CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, stridec); } +<<<<<<< HEAD CuBlasLtMatmulPreference preference; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef USE_ROCM uint32_t a_alignment = _getAlignment(reinterpret_cast(a)); uint32_t b_alignment = _getAlignment(reinterpret_cast(b)); @@ -463,6 +558,10 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { TORCH_CHECK(ltworkspace.ptr != nullptr, 
"OOM trying to allocate workspace for cublaslt"); preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, ltworkspace.size); +<<<<<<< HEAD +======= + cublasStatus_t cublasStatus = CUBLAS_STATUS_SUCCESS; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasLtMatmulHeuristicResult_t heuristicResult = {}; int returnedResult = 0; TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic( @@ -477,10 +576,17 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { &heuristicResult, &returnedResult)); if (returnedResult == 0) { +<<<<<<< HEAD TORCH_CUDABLAS_CHECK(CUBLAS_STATUS_NOT_SUPPORTED); } cublasStatus_t cublasStatus = cublasLtMatmul( +======= + cublasStatus = CUBLAS_STATUS_NOT_SUPPORTED; + } + else { + cublasStatus = cublasLtMatmul( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ltHandle, computeDesc.descriptor(), alpha_ptr, @@ -496,6 +602,7 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { &heuristicResult.algo, ltworkspace.ptr, ltworkspace.size, +<<<<<<< HEAD stream); #ifdef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { @@ -505,6 +612,13 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { TORCH_CHECK( cublasStatus == CUBLAS_STATUS_SUCCESS, "CUDA error: ", +======= + at::cuda::getCurrentCUDAStream()); + } + if (cublasStatus != CUBLAS_STATUS_SUCCESS) { + TORCH_WARN( + "bgemm_internal_cublaslt error: ", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::cuda::blas::_cublasGetErrorEnum(cublasStatus), " when calling cublasLtMatmul with transpose_mat1 ", (opa == CUBLAS_OP_T), @@ -522,6 +636,7 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { ldb, " ldc ", ldc, +<<<<<<< HEAD " abcType ", abcType, " computeType ", @@ -534,6 +649,26 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { template inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { static_assert(false && sizeof(Dtype), "at::cuda::blas::bgemm_internal_cublas: not implemented"); +======= + " abType ", + abType, + " cType ", + cType, + " computeType ", + computeType, + " scaleType ", + scaleType, + ". 
Will attempt to recover by calling cublas instead."); + return false; + } + return true; +} + + +template +inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) { + TORCH_CHECK(false, "at::cuda::blas::bgemm: not implemented for input type ", typeid(Dtype).name(), " and output type ", typeid(C_Dtype).name()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> @@ -592,8 +727,13 @@ void bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGTYPES(c10::com reinterpret_cast(c), ldc, stridec, num_batches)); } +<<<<<<< HEAD template <> void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { +======= +template +inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); @@ -638,6 +778,7 @@ void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { handle, opa, opb, m, n, k, alpha_ptr, a, CUDA_R_16F, lda, stridea, b, CUDA_R_16F, ldb, strideb, beta_ptr, +<<<<<<< HEAD c, CUDA_R_16F, ldc, stridec, num_batches, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } else { @@ -648,13 +789,40 @@ void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { alpha, (a + i * stridea), lda, (b + i * strideb), ldb, beta, (c + i * stridec), ldc); +======= + c, std::is_same_v ? CUDA_R_32F : CUDA_R_16F, ldc, stridec, + num_batches, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } else { + for (const auto i : c10::irange(num_batches)) { + if (std::is_same_v) { + float* c_ptr = (float*)(c + i * stridec); + at::cuda::blas::gemm( + transa, transb, + m, n, k, + alpha, (a + i * stridea), lda, + (b + i * strideb), ldb, beta, + c_ptr, ldc); + } else { + at::cuda::blas::gemm( + transa, transb, + m, n, k, + alpha, (a + i * stridea), lda, + (b + i * strideb), ldb, beta, + (c + i * stridec), ldc); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } #endif // USE_ROCM } +<<<<<<< HEAD template <> void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { +======= +template +inline void bgemm_internal_cublas_bfloat16_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); BGEMM_CHECK_ARGVALUES(at::BFloat16); @@ -671,6 +839,7 @@ void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) auto compute_type = CUDA_R_32F; #endif TORCH_CUDABLAS_CHECK(cublasGemmStridedBatchedEx(handle, +<<<<<<< HEAD opa, opb, (int)m, (int)n, (int)k, (void*)&falpha, a, CUDA_R_16BF, (int)lda, stridea, b, CUDA_R_16BF, (int)ldb, strideb, @@ -681,6 +850,40 @@ void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) } template <> +======= + opa, opb, (int)m, (int)n, (int)k, + (void*)&falpha, a, CUDA_R_16BF, (int)lda, stridea, + b, CUDA_R_16BF, (int)ldb, strideb, + (void*)&fbeta, c, std::is_same_v ? 
CUDA_R_32F : CUDA_R_16BF, + (int)ldc, stridec, (int)num_batches, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); +} + +template <> +void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { + bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGS(at::Half)); +} + +template <> +void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) { + bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGS(at::Half)); +} + +template <> +void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { + bgemm_internal_cublas_bfloat16_helper(CUDABLAS_BGEMM_ARGS(at::BFloat16)); +} + + +template <> +void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) { + bgemm_internal_cublas_bfloat16_helper(CUDABLAS_BGEMM_ARGS(at::BFloat16)); +} + + +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)) { if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { @@ -688,7 +891,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)) // hipblaslt does not support double gemm yet bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(double)); #else +<<<<<<< HEAD bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(double)); +======= + if (!bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(double))) { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(double)); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif } else { @@ -700,7 +909,13 @@ template <> void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(float)) { if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +<<<<<<< HEAD bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(float)); +======= + if (!bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(float))) { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(float)); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(float)); @@ -715,7 +930,13 @@ void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex gemm yet bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGS(c10::complex)); #else +<<<<<<< HEAD bgemm_internal_cublaslt>(CUDABLAS_BGEMM_ARGS(c10::complex)); +======= + if (!bgemm_internal_cublaslt>(CUDABLAS_BGEMM_ARGS(c10::complex))) { + bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGS(c10::complex)); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif } else { @@ -731,7 +952,13 @@ void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex gemm yet bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGS(c10::complex)); #else +<<<<<<< HEAD bgemm_internal_cublaslt>(CUDABLAS_BGEMM_ARGS(c10::complex)); +======= + if (!bgemm_internal_cublaslt>(CUDABLAS_BGEMM_ARGS(c10::complex))) { + bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGS(c10::complex)); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif } else { @@ -743,7 +970,13 @@ template <> void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +<<<<<<< HEAD bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::Half)); +======= + if (!bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::Half))) { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::Half)); + } +>>>>>>> 5729657180 ([ROCm] 
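// Illustrative sketch (not part of the patch): the recovery pattern the
// incoming side applies above -- attempt the cublasLt path first and, when it
// reports failure by returning false, fall back to the plain cuBLAS path.
// Names below are placeholders, not real PyTorch or cuBLAS APIs.
#include <functional>

inline void run_with_fallback(const std::function<bool()>& try_lt_path,
                              const std::function<void()>& plain_path) {
  if (!try_lt_path()) {
    // The Lt path declined (unsupported dtype/shape combination, no heuristic
    // result, or a known-bad library version), so recover with the path that
    // is always supported.
    plain_path();
  }
}

inline void run_with_fallback_demo() {
  run_with_fallback(
      [] { return false; },              // Lt path declines
      [] { /* plain path runs here */ });
}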
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::Half)); @@ -754,9 +987,17 @@ template <> void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +<<<<<<< HEAD bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); } #ifdef USE_ROCM +======= + if (!bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16))) { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } + } +#if defined(USE_ROCM) && !defined(_MSC_VER) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { at::native::bgemm_internal_ck(CUDABLAS_BGEMM_ARGS(at::BFloat16)); } @@ -766,9 +1007,56 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) } } +<<<<<<< HEAD template inline void bgemm_tunable(CUDABLAS_BGEMM_ARGTYPES(DType)) { tunable::GemmStridedBatchedParams params; +======= +template<> +void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) +{ + if (at::globalContext().allowFP16AccumulationCuBLAS()) { + // Do not allow fp16 reductions with fp32 output + TORCH_CHECK(false, "bgemm input type at::Half and output type float is not supported with allowFP16AccumulationCuBLAS"); + } + + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + if (!bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::Half))) { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::Half)); + } + } +#if defined(USE_ROCM) && !defined(_MSC_VER) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm"); + } +#endif + else { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::Half)); + } +} + +template<> +void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) +{ + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + if (!bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16))) { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } + } +#if defined(USE_ROCM) && !defined(_MSC_VER) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + TORCH_CHECK(false, "gemm input type at::BFloat16 and output type float is not supported for ROCm"); + } +#endif + else { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +} + +template +inline void bgemm_tunable(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) { + tunable::GemmStridedBatchedParams params; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) params.transa = transa; params.transb = transb; params.m = m; @@ -791,6 +1079,7 @@ inline void bgemm_tunable(CUDABLAS_BGEMM_ARGTYPES(DType)) { bool transb_ = ((transb != 'n') && (transb != 'N')); if (transa_ && transb_) { +<<<<<<< HEAD static tunable::GemmStridedBatchedTunableOp bgemm{}; bgemm(¶ms); } @@ -804,6 +1093,21 @@ inline void bgemm_tunable(CUDABLAS_BGEMM_ARGTYPES(DType)) { } else if (!transa_ && !transb_) { static tunable::GemmStridedBatchedTunableOp bgemm{}; +======= + static tunable::GemmStridedBatchedTunableOp bgemm{}; + bgemm(¶ms); + } + else if (transa_ && !transb_) { + static tunable::GemmStridedBatchedTunableOp bgemm{}; + bgemm(¶ms); + } + else if (!transa_ && transb_) { + static 
tunable::GemmStridedBatchedTunableOp bgemm{}; + bgemm(¶ms); + } + else if (!transa_ && !transb_) { + static tunable::GemmStridedBatchedTunableOp bgemm{}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bgemm(¶ms); } else { @@ -877,6 +1181,7 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { } } +<<<<<<< HEAD template inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES(Dtype)) { // forward to bgemm implementation but set strides and batches to 0 @@ -889,6 +1194,39 @@ inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(Dtype)) { } +======= +template <> +void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) { + #ifdef USE_ROCM + TORCH_CHECK(false, "bgemm input type at::Half and output type float is not supported for ROCm"); + #endif + // TODO: Support tuning for Half inputs and FP32 output + bgemm_internal(CUDABLAS_BGEMM_ARGS(at::Half)); +} + + +template <> +void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) { + #ifdef USE_ROCM + TORCH_CHECK(false, "bgemm input type at::BFloat16 and output type float is not supported for ROCm"); + #else + cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); + + if (prop->major < 8) + TORCH_CHECK(false, "bgemm input type at::BFloat16 and output type float is only supported for CUDA devices with compute capability 8.0 or higher"); + #endif + // TODO: Support tuning for BFloat16 inputs and FP32 output + bgemm_internal(CUDABLAS_BGEMM_ARGS(at::BFloat16)); +} + + + +template +inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) { + TORCH_CHECK(false, "at::cuda::blas::gemm: not implemented for input type ", typeid(Dtype).name(), " and output type ", typeid(C_Dtype).name()); +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template <> void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(double)) { // See Note [Writing Nondeterministic Operations] @@ -945,8 +1283,13 @@ void gemm_internal_cublas>(CUDABLAS_GEMM_ARGTYPES(c10::compl reinterpret_cast(c), ldc)); } +<<<<<<< HEAD template <> void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(at::Half)) { +======= +template +inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); @@ -1025,7 +1368,11 @@ void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(at::Half)) { ldb, beta_ptr, c, +<<<<<<< HEAD CUDA_R_16F, +======= + std::is_same_v ? CUDA_R_32F : CUDA_R_16F, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); @@ -1047,14 +1394,23 @@ void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(at::Half)) { ldb, &fbeta, c, +<<<<<<< HEAD CUDA_R_16F, +======= + std::is_same_v ? 
CUDA_R_32F : CUDA_R_16F, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ldc)); } #endif } +<<<<<<< HEAD template <> void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { +======= +template +inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); @@ -1091,7 +1447,11 @@ void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { ldb, &fbeta, c, +<<<<<<< HEAD CUDA_R_16BF, +======= + std::is_same_v ? CUDA_R_32F : CUDA_R_16BF, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); @@ -1099,6 +1459,37 @@ void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { } template <> +<<<<<<< HEAD +======= +void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(at::Half)) { + gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGS(at::Half)); +} + +template <> +void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) { + gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGS(at::Half)); +} + +template <> +void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { + gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGS(at::BFloat16)); +} + +template <> +void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) { + gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGS(at::BFloat16)); +} + +template +inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) { + // forward to bgemm implementation but set strides and batches to 0 + if (!bgemm_internal_cublaslt(transa, transb, m, n, k, alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 0, 0)) { + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(Dtype)); + } +} + +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) { if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { @@ -1109,7 +1500,11 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(double)); #endif } +<<<<<<< HEAD #ifdef USE_ROCM +======= +#if defined(USE_ROCM) && !defined(_MSC_VER) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(double)); } @@ -1125,9 +1520,19 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); } +<<<<<<< HEAD #ifdef USE_ROCM else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(float)); +======= +#if defined(USE_ROCM) && !defined(_MSC_VER) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) { //no CK GEMM version for gfx1100 + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); + } else{ + 
at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(float)); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif else { @@ -1173,7 +1578,11 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); } +<<<<<<< HEAD #ifdef USE_ROCM +======= +#if defined(USE_ROCM) && !defined(_MSC_VER) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::Half)); } @@ -1189,7 +1598,11 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); } +<<<<<<< HEAD #ifdef USE_ROCM +======= +#if defined(USE_ROCM) && !defined(_MSC_VER) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::BFloat16)); } @@ -1199,8 +1612,50 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) } } +<<<<<<< HEAD template inline void gemm_tunable(CUDABLAS_GEMM_ARGTYPES(DType)) { +======= +template<> +void gemm_internal(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) +{ + if (at::globalContext().allowFP16AccumulationCuBLAS()) { + // Do not allow fp16 reductions with fp32 output + TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported with allowFP16AccumulationCuBLAS"); + } + + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); + } +#if defined(USE_ROCM) && !defined(_MSC_VER) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm"); + } +#endif + else { + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::Half)); + } +} + +template<> +void gemm_internal(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) +{ + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +#if defined(USE_ROCM) && !defined(_MSC_VER) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm"); + } +#endif + else { + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +} + +template +inline void gemm_tunable(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(DType, C_Dtype)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) tunable::GemmParams params; params.transa = transa; params.transb = transb; @@ -1306,9 +1761,39 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { } } +<<<<<<< HEAD template void gemm_and_bias( +======= +template <> +void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) { + #ifdef USE_ROCM + TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm"); + #endif + // TODO: Support Tuning for fp16-fp32 gemm + gemm_internal(CUDABLAS_GEMM_ARGS(at::Half)); +} + + +template <> +void 
gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) { + #ifdef USE_ROCM + TORCH_CHECK(false, "gemm input type at::BFloat16 and output type float is not supported for ROCm"); + #else + cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); + + if (prop->major < 8) + TORCH_CHECK(false, "gemm input type at::BFloat16 and output type float is only supported for CUDA devices with compute capability 8.0 or higher"); + #endif + // TODO: Support Tuning for bf16-fp32 gemm + gemm_internal(CUDABLAS_GEMM_ARGS(at::BFloat16)); +} + + +template +bool gemm_and_bias( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool transpose_mat1, bool transpose_mat2, int64_t m, @@ -1320,6 +1805,7 @@ void gemm_and_bias( const Dtype* mat2_ptr, int64_t mat2_ld, const Dtype* bias, +<<<<<<< HEAD Dtype* result_ptr, int64_t result_ld, GEMMAndBiasActivationEpilogue activation) { @@ -1329,6 +1815,32 @@ void gemm_and_bias( cudaDataType_t abcType = CUDA_R_32F; cublasComputeType_t computeType = CUBLAS_COMPUTE_32F; cudaDataType_t scaleType = CUDA_R_32F; +======= + C_Dtype* result_ptr, + int64_t result_ld, + GEMMAndBiasActivationEpilogue activation) { + + if (std::is_same_v && std::is_same_v) { + #ifdef USE_ROCM + TORCH_CHECK(false, "gemm input type at::BFloat16 and output type float is not supported for ROCm"); + #endif + } else if (std::is_same_v && std::is_same_v) { + #ifdef USE_ROCM + TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm"); + #endif + if (at::globalContext().allowFP16AccumulationCuBLAS()) + TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported with allowFP16AccumulationCuBLAS"); + } + + using opmath_t = at::opmath_type; + opmath_t beta_val = 0; // bias is added in epilogue + + cudaDataType_t abType = CUDA_R_32F; + cudaDataType_t cType = CUDA_R_32F; + cublasComputeType_t computeType = CUBLAS_COMPUTE_32F; + cudaDataType_t scaleType = CUDA_R_32F; + CuBlasLtMatmulPreference preference; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void * alpha_ptr = &alpha_val; void * beta_ptr = &beta_val; #ifndef USE_ROCM @@ -1336,14 +1848,22 @@ void gemm_and_bias( at::Half hbeta_val; #endif if constexpr (std::is_same_v) { +<<<<<<< HEAD abcType = CUDA_R_64F; +======= + abType = CUDA_R_64F; + cType = CUDA_R_64F; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) computeType = CUBLAS_COMPUTE_64F; scaleType = CUDA_R_64F; } else if constexpr (std::is_same_v) { if (at::globalContext().allowTF32CuBLAS()) { computeType = CUBLAS_COMPUTE_32F_FAST_TF32; } +<<<<<<< HEAD abcType = CUDA_R_32F; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else if constexpr (std::is_same_v) { #ifndef USE_ROCM cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); @@ -1356,9 +1876,29 @@ void gemm_and_bias( beta_ptr = &hbeta_val; } #endif +<<<<<<< HEAD abcType = CUDA_R_16F; } else if constexpr (std::is_same_v) { abcType = CUDA_R_16BF; +======= + abType = CUDA_R_16F; + cType = (std::is_same_v) ? 
CUDA_R_32F : CUDA_R_16F; +#ifndef USE_ROCM + if (!at::globalContext().allowFP16ReductionCuBLAS()) { + preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, + CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE); + } +#endif + } else if constexpr (std::is_same_v) { + abType = CUDA_R_16BF; + cType = (std::is_same_v) ? CUDA_R_32F : CUDA_R_16BF; +#ifndef USE_ROCM + if (!at::globalContext().allowBF16ReductionCuBLAS()) { + preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, + CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE); + } +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType); @@ -1366,7 +1906,10 @@ void gemm_and_bias( computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, transa); cublasOperation_t transb = transpose_mat2 ? CUBLAS_OP_T : CUBLAS_OP_N; computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, transb); +<<<<<<< HEAD auto stream = at::cuda::getCurrentCUDAStream(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { computeDesc.setAttribute( @@ -1374,12 +1917,15 @@ void gemm_and_bias( at::cuda::getCurrentDeviceProperties()->multiProcessorCount - at::globalContext()._SMCarveout_EXPERIMENTAL().value()); } +<<<<<<< HEAD #else if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { stream = _getCarveoutStream( at::globalContext()._SMCarveout_EXPERIMENTAL().value()); _syncCurrentWithCarveoutStream(stream, true); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BIAS; if (activation == GEMMAndBiasActivationEpilogue::RELU) { @@ -1395,11 +1941,18 @@ void gemm_and_bias( computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_POINTER, bias); } +<<<<<<< HEAD CuBlasLtMatrixLayout Adesc(abcType, m, k, mat1_ld, transpose_mat1); CuBlasLtMatrixLayout Bdesc(abcType, k, n, mat2_ld, transpose_mat2); CuBlasLtMatrixLayout Cdesc(abcType, m, n, result_ld); CuBlasLtMatmulPreference preference; +======= + CuBlasLtMatrixLayout Adesc(abType, m, k, mat1_ld, transpose_mat1); + CuBlasLtMatrixLayout Bdesc(abType, k, n, mat2_ld, transpose_mat2); + CuBlasLtMatrixLayout Cdesc(cType, m, n, result_ld); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto ltworkspace = CublasLtWorkspace(); preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, ltworkspace.size); @@ -1428,11 +1981,20 @@ void gemm_and_bias( 1, &heuristicResult, &returnedResult)); +<<<<<<< HEAD if (returnedResult == 0) { TORCH_CUDABLAS_CHECK(CUBLAS_STATUS_NOT_SUPPORTED); } cublasStatus_t cublasStatus = cublasLtMatmul( +======= + cublasStatus_t cublasStatus = CUBLAS_STATUS_SUCCESS; + if (returnedResult == 0) { + cublasStatus = CUBLAS_STATUS_NOT_SUPPORTED; + } + else { + cublasStatus = cublasLtMatmul( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ltHandle, computeDesc.descriptor(), alpha_ptr, @@ -1448,6 +2010,7 @@ void gemm_and_bias( &heuristicResult.algo, ltworkspace.ptr, ltworkspace.size, +<<<<<<< HEAD stream); #ifdef USE_ROCM if 
(at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { @@ -1457,6 +2020,13 @@ void gemm_and_bias( TORCH_CHECK( cublasStatus == CUBLAS_STATUS_SUCCESS, "CUDA error: ", +======= + at::cuda::getCurrentCUDAStream()); + } + if (cublasStatus != CUBLAS_STATUS_SUCCESS) { + TORCH_WARN( + "gemm_and_bias error: ", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::cuda::blas::_cublasGetErrorEnum(cublasStatus), " when calling cublasLtMatmul with transpose_mat1 ", transpose_mat1, @@ -1474,6 +2044,7 @@ void gemm_and_bias( mat2_ld, " result_ld ", result_ld, +<<<<<<< HEAD " abcType ", abcType, " computeType ", @@ -1483,6 +2054,23 @@ void gemm_and_bias( } template void gemm_and_bias( +======= + " abType ", + abType, + " cType ", + cType, + " computeType ", + computeType, + " scaleType ", + scaleType, + ". Will attempt to recover by calling unfused cublas path."); + return false; + } + return true; +} + +template bool gemm_and_bias( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool transpose_mat1, bool transpose_mat2, int64_t m, @@ -1498,7 +2086,11 @@ template void gemm_and_bias( int64_t result_ld, GEMMAndBiasActivationEpilogue activation); +<<<<<<< HEAD template void gemm_and_bias( +======= +template bool gemm_and_bias( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool transpose_mat1, bool transpose_mat2, int64_t m, @@ -1514,7 +2106,11 @@ template void gemm_and_bias( int64_t result_ld, GEMMAndBiasActivationEpilogue activation); +<<<<<<< HEAD template void gemm_and_bias( +======= +template bool gemm_and_bias( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool transpose_mat1, bool transpose_mat2, int64_t m, @@ -1530,7 +2126,27 @@ template void gemm_and_bias( int64_t result_ld, GEMMAndBiasActivationEpilogue activation); +<<<<<<< HEAD template void gemm_and_bias( +======= +template bool gemm_and_bias( + bool transpose_mat1, + bool transpose_mat2, + int64_t m, + int64_t n, + int64_t k, + at::opmath_type alpha_val, + const at::Half* mat1_ptr, + int64_t mat1_ld, + const at::Half* mat2_ptr, + int64_t mat2_ld, + const at::Half* bias, + float* result_ptr, + int64_t result_ld, + GEMMAndBiasActivationEpilogue activation); + +template bool gemm_and_bias( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool transpose_mat1, bool transpose_mat2, int64_t m, @@ -1546,6 +2162,25 @@ template void gemm_and_bias( int64_t result_ld, GEMMAndBiasActivationEpilogue activation); +<<<<<<< HEAD +======= +template bool gemm_and_bias( + bool transpose_mat1, + bool transpose_mat2, + int64_t m, + int64_t n, + int64_t k, + at::opmath_type alpha_val, + const at::BFloat16* mat1_ptr, + int64_t mat1_ld, + const at::BFloat16* mat2_ptr, + int64_t mat2_ld, + const at::BFloat16* bias, + float* result_ptr, + int64_t result_ld, + GEMMAndBiasActivationEpilogue activation); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void scaled_gemm( char transa, char transb, @@ -1570,6 +2205,11 @@ void scaled_gemm( ScalarType result_dtype, bool use_fast_accum, bool use_rowwise) { +<<<<<<< HEAD +======= + // Note: see `cublasCommonArgs` for various non-intuitive manupulations + // 
of input arguments to this function. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if CUDA_VERSION >= 11080 || defined(USE_ROCM) const auto computeType = CUBLAS_COMPUTE_32F; const auto scaleType = CUDA_R_32F; @@ -1608,7 +2248,10 @@ void scaled_gemm( if (result_scale_ptr != nullptr) { computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr); } +<<<<<<< HEAD auto stream = at::cuda::getCurrentCUDAStream(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { computeDesc.setAttribute( @@ -1616,6 +2259,7 @@ void scaled_gemm( at::cuda::getCurrentDeviceProperties()->multiProcessorCount - at::globalContext()._SMCarveout_EXPERIMENTAL().value()); } +<<<<<<< HEAD #else if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { stream = _getCarveoutStream( @@ -1627,6 +2271,13 @@ void scaled_gemm( const int8_t fastAccuMode = use_fast_accum ? 1 : 0; computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_FAST_ACCUM, fastAccuMode); #endif +======= +#endif // ifndef USE_ROCM +#ifndef USE_ROCM + const int8_t fastAccuMode = use_fast_accum ? 1 : 0; + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_FAST_ACCUM, fastAccuMode); +#endif // ifndef USE_ROCM +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CuBlasLtMatrixLayout Adesc(ScalarTypeToCudaDataType(mat1_dtype), m, k, mat1_ld, transa == 't'); CuBlasLtMatrixLayout Bdesc(ScalarTypeToCudaDataType(mat2_dtype), k, n, mat2_ld, transb == 't'); #ifdef USE_ROCM @@ -1634,7 +2285,11 @@ void scaled_gemm( CuBlasLtMatrixLayout Cdesc(ScalarTypeToCudaDataType(result_dtype), m, n, result_ld); #else CuBlasLtMatrixLayout Cdesc(ScalarTypeToCudaDataType(bias_dtype), m, n, result_ld); +<<<<<<< HEAD #endif +======= +#endif // ifdef USE_ROCM +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CuBlasLtMatrixLayout Ddesc(ScalarTypeToCudaDataType(result_dtype), m, n, result_ld); if (bias_ptr) { computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_POINTER, bias_ptr); @@ -1648,7 +2303,18 @@ void scaled_gemm( computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0); #else TORCH_CHECK(false, "scaled_gemm with `torch.float8_e8m0fnu` scales is only supported for CUDA 12.8 or ROCm 7.0(with gfx950) and above"); +<<<<<<< HEAD #endif // CUDA_VERSION >= 12080 +======= +#endif // if CUDA_VERSION >= 12080 + } else if (mat1_scale_dtype == kFloat8_e4m3fn && mat2_scale_dtype == kFloat8_e4m3fn) { +#if CUDA_VERSION >= 12080 + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3); + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3); +#else + TORCH_CHECK(false, "scaled_gemm with `torch.float8_e4m3fn` scales is only supported for CUDA 12.8 and above"); +#endif // if CUDA_VERSION >= 12080 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else if (mat1_scale_dtype == kFloat && mat2_scale_dtype == kFloat && use_rowwise) { #if CUDA_VERSION >= 12090 || (defined(USE_ROCM) && defined(HIPBLASLT_OUTER_VEC)) computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, 
CUBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F); @@ -1660,6 +2326,10 @@ void scaled_gemm( #endif // if CUDA_VERSION >= 12090 } +<<<<<<< HEAD +======= + auto stream = c10::cuda::getCurrentCUDAStream(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CuBlasLtMatmulPreference preference; auto ltworkspace = CublasLtWorkspace(); preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, ltworkspace.size); @@ -1723,7 +2393,11 @@ void scaled_gemm( } } TORCH_CHECK(found, "could not find valid hipblaslt solution"); +<<<<<<< HEAD #endif +======= +#endif // ifndef USE_ROCM +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } cublasStatus_t cublasStatus = cublasLtMatmul( ltHandle, @@ -1738,7 +2412,11 @@ void scaled_gemm( result_ptr, // unused, since beta_val is 0, but hipblaslt can't handle nullptr #else nullptr, +<<<<<<< HEAD #endif +======= +#endif // ifdef USE_ROCM +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Cdesc.descriptor(), result_ptr, Ddesc.descriptor(), @@ -1746,11 +2424,14 @@ void scaled_gemm( ltworkspace.ptr, ltworkspace.size, stream); +<<<<<<< HEAD #ifdef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { _syncCurrentWithCarveoutStream(stream, false); } #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK( cublasStatus == CUBLAS_STATUS_SUCCESS, "CUDA error: ", @@ -1776,7 +2457,11 @@ void scaled_gemm( " scaleType ", scaleType); return; +<<<<<<< HEAD #endif // CUDA_VERSION >= 11080 || defined(USE_ROCM) +======= +#endif // if CUDA_VERSION >= 11080 || defined(USE_ROCM) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(false, "scaled_gemm is only supported for CUDA 11.8 and above"); } @@ -1804,7 +2489,10 @@ void int8_gemm( computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, transa); cublasOperation_t transb = transpose_mat2 ? 
CUBLAS_OP_T : CUBLAS_OP_N; computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, transb); +<<<<<<< HEAD auto stream = at::cuda::getCurrentCUDAStream(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { computeDesc.setAttribute( @@ -1812,12 +2500,15 @@ void int8_gemm( at::cuda::getCurrentDeviceProperties()->multiProcessorCount - at::globalContext()._SMCarveout_EXPERIMENTAL().value()); } +<<<<<<< HEAD #else if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { stream = _getCarveoutStream( at::globalContext()._SMCarveout_EXPERIMENTAL().value()); _syncCurrentWithCarveoutStream(stream, true); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif CuBlasLtMatrixLayout Adesc(abType, m, k, mat1_ld, transpose_mat1); @@ -1879,7 +2570,11 @@ void int8_gemm( #else 0, #endif +<<<<<<< HEAD stream); +======= + at::cuda::getCurrentCUDAStream()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK( cublasStatus == CUBLAS_STATUS_SUCCESS, "CUDA error: ", @@ -1908,11 +2603,14 @@ void int8_gemm( computeType, " scaleType ", scaleType); +<<<<<<< HEAD #ifdef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { _syncCurrentWithCarveoutStream(stream, false); } #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> @@ -2224,6 +2922,11 @@ void vdot>(CUDABLAS_DOT_ARGTYPES(c10::complex)) { reinterpret_cast(result))); } +<<<<<<< HEAD +======= +// HIP on Windows does not support +#if !(defined(USE_ROCM) && defined(_MSC_VER)) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template <> void getrsBatched(CUDABLAS_GETRS_ARGTYPES(float)) { TORCH_CUDABLAS_CHECK(cublasSgetrsBatched( @@ -2422,5 +3125,9 @@ void gelsBatched>(CUDABLAS_GELS_BATCHED_ARGTYPES(c10::comple devInfoArray, batchSize)); } +<<<<<<< HEAD +======= +#endif // !(defined(USE_ROCM) && defined(_MSC_VER)) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::cuda::blas diff --git a/aten/src/ATen/cuda/CUDABlas.h b/aten/src/ATen/cuda/CUDABlas.h index 6075e7b9c9d8..eaa071bd5a4e 100644 --- a/aten/src/ATen/cuda/CUDABlas.h +++ b/aten/src/ATen/cuda/CUDABlas.h @@ -39,6 +39,7 @@ class PointerModeGuard { /* LEVEL 3 BLAS FUNCTIONS */ +<<<<<<< HEAD #define CUDABLAS_GEMM_ARGTYPES(Dtype) \ char transa, char transb, int64_t m, int64_t n, int64_t k, at::opmath_type alpha, \ const Dtype *a, int64_t lda, const Dtype *b, int64_t ldb, at::opmath_type beta,\ @@ -51,6 +52,28 @@ inline void gemm(CUDABLAS_GEMM_ARGTYPES(Dtype)) { static_assert(false&&sizeof(Dtype),"at::cuda::blas::gemm: not implemented"); } +======= +#define CUDABLAS_GEMM_ARGTYPES(Dtype) CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, Dtype) + +#define CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype) \ + char transa, char transb, int64_t m, int64_t n, int64_t k, at::opmath_type alpha, \ + const Dtype *a, int64_t lda, const Dtype *b, int64_t ldb, at::opmath_type beta,\ + C_Dtype *c, int64_t ldc + +#define CUDABLAS_GEMM_ARGS(Dtype) transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, 
ldc + +#define CUDABLAS_GEMM_DTYPE_IS_FLOAT_TYPE_AND_C_DTYPE_IS_FLOAT \ + ((std::is_same::value || std::is_same::value) && std::is_same::value) + +template ::type* = nullptr> +inline void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) { + static_assert(false&&sizeof(Dtype),"at::cuda::blas::gemm: not implemented"); +} + +template ::type* = nullptr> +void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template <> void gemm(CUDABLAS_GEMM_ARGTYPES(double)); template <> @@ -63,9 +86,19 @@ template <> void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)); template <> void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)); +<<<<<<< HEAD template inline void gemm_internal(CUDABLAS_GEMM_ARGTYPES(Dtype)) { +======= +template<> +void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)); +template<> +void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)); + +template +inline void gemm_internal(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static_assert(false&&sizeof(Dtype),"at::cuda::blas::gemm_internal: not implemented"); } @@ -81,6 +114,13 @@ template <> void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)); template <> void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)); +<<<<<<< HEAD +======= +template<> +void gemm_internal(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)); +template<> +void gemm_internal(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) enum GEMMAndBiasActivationEpilogue { None, @@ -90,8 +130,13 @@ enum GEMMAndBiasActivationEpilogue { // NOTE: GELU activation is not supported prior to CUDA 11.4 and will // do nothing if passed in that case. 
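A minimal sketch of how the mixed-precision overloads declared above are expected to be called. The template parameter lists were stripped in the text above and are assumed to have the form template <typename Dtype, typename C_Dtype = Dtype, ...>, so C_Dtype is deduced from the output pointer; the wrapper function and buffer layout below are hypothetical.

#include <cstdint>
#include <ATen/cuda/CUDABlas.h>
#include <c10/util/Half.h>

// Hypothetical wrapper: fp16 inputs accumulated into an fp32 output via the
// (at::Half, float) gemm specialization declared above. Column-major layout:
// a is m x k (lda = m), b is k x n (ldb = k), c is m x n (ldc = m).
void gemm_half_in_float_out(int64_t m, int64_t n, int64_t k,
                            const at::Half* a, const at::Half* b, float* c) {
  // Dtype = at::Half and C_Dtype = float are deduced from the pointer types.
  at::cuda::blas::gemm(
      /*transa=*/'n', /*transb=*/'n', m, n, k,
      /*alpha=*/1.0f,
      a, /*lda=*/m,
      b, /*ldb=*/k,
      /*beta=*/0.0f,
      c, /*ldc=*/m);
}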
+<<<<<<< HEAD template void gemm_and_bias( +======= +template +bool gemm_and_bias( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool transpose_mat1, bool transpose_mat2, int64_t m, @@ -103,7 +148,11 @@ void gemm_and_bias( const Dtype* mat2_ptr, int64_t mat2_ld, const Dtype* bias, +<<<<<<< HEAD Dtype* result_ptr, +======= + C_Dtype* result_ptr, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t result_ld, GEMMAndBiasActivationEpilogue activation = GEMMAndBiasActivationEpilogue::None); @@ -145,20 +194,41 @@ void scaled_gemm( bool use_fast_accum, bool use_rowwise); +<<<<<<< HEAD #define CUDABLAS_BGEMM_ARGTYPES(Dtype) \ char transa, char transb, int64_t m, int64_t n, int64_t k, at::opmath_type alpha, \ const Dtype *a, int64_t lda, int64_t stridea, \ const Dtype *b, int64_t ldb, int64_t strideb, \ at::opmath_type beta, Dtype *c, int64_t ldc, int64_t stridec, int64_t num_batches +======= +#define CUDABLAS_BGEMM_ARGTYPES(Dtype) CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, Dtype) + +#define CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype) \ + char transa, char transb, int64_t m, int64_t n, int64_t k, at::opmath_type alpha, \ + const Dtype *a, int64_t lda, int64_t stridea, \ + const Dtype *b, int64_t ldb, int64_t strideb, \ + at::opmath_type beta, C_Dtype *c, int64_t ldc, int64_t stridec, int64_t num_batches +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define CUDABLAS_BGEMM_ARGS(Dtype) \ transa, transb, m, n, k, alpha, a, lda, stridea, b, ldb, strideb, beta, c, ldc, stridec, num_batches +<<<<<<< HEAD template inline void bgemm(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { static_assert(false&&sizeof(Dtype),"at::cuda::blas::bgemm: not implemented"); } +======= +template ::type* = nullptr> +inline void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) { + static_assert(false&&sizeof(Dtype),"at::cuda::blas::bgemm: not implemented"); +} + +template ::type* = nullptr> +void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES(double)); template <> @@ -171,9 +241,19 @@ template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::Half)); template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +<<<<<<< HEAD template inline void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { +======= +template<> +void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)); +template<> +void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)); + +template +inline void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static_assert(false&&sizeof(Dtype),"at::cuda::blas::bgemm_internal: not implemented"); } @@ -189,6 +269,13 @@ template <> void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)); template <> void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +<<<<<<< HEAD +======= +template<> +void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)); +template<> +void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for 
mixed dtypes with float/bfloat16/half (#2791)) #define CUDABLAS_TRSM_ARGTYPES(Dtype) \ cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, \ @@ -292,6 +379,24 @@ void vdot>(CUDABLAS_DOT_ARGTYPES(c10::complex)); int n, int nrhs, Dtype** dA_array, int lda, int* ipiv_array, \ Dtype** dB_array, int ldb, int* info_array, int batchsize +<<<<<<< HEAD +======= +#define CUDABLAS_GEQRF_BATCHED_ARGTYPES(Dtype) \ + cublasHandle_t handle, int m, int n, Dtype **A_array, int lda, \ + Dtype **tau_array, int *info, int batchsize + +#define CUDABLAS_GETRF_ARGTYPES(Dtype) \ + int n, Dtype** dA_array, int ldda, int* ipiv_array, int* info_array, int batchsize + +#define CUDABLAS_GELS_BATCHED_ARGTYPES(Dtype) \ + cublasHandle_t handle, cublasOperation_t trans, \ + int m, int n, int nrhs, Dtype** dA_array, int ldda, \ + Dtype** dC_array, int lddc, int* info, int *devInfoArray, int batchSize + +// HIP on Windows does not support getrs, geqrf, getrf, gels +#if !(defined(USE_ROCM) && defined(_MSC_VER)) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template void getrsBatched(CUDABLAS_GETRS_ARGTYPES(Dtype)) { static_assert(false&&sizeof(Dtype),"at::cuda::blas::getrsBatched: not implemented"); @@ -305,10 +410,13 @@ TORCH_CUDA_CU_API void getrsBatched>(CUDABLAS_GETRS_ARGTYPES template<> TORCH_CUDA_CU_API void getrsBatched>(CUDABLAS_GETRS_ARGTYPES(c10::complex)); +<<<<<<< HEAD #define CUDABLAS_GEQRF_BATCHED_ARGTYPES(Dtype) \ cublasHandle_t handle, int m, int n, Dtype **A_array, int lda, \ Dtype **tau_array, int *info, int batchsize +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template void geqrfBatched(CUDABLAS_GEQRF_BATCHED_ARGTYPES(Dtype)) { static_assert(false&&sizeof(Dtype), "at::cuda::blas::geqrfBatched: not implemented"); @@ -324,12 +432,18 @@ template <> TORCH_CUDA_CU_API void geqrfBatched>( CUDABLAS_GEQRF_BATCHED_ARGTYPES(c10::complex)); +<<<<<<< HEAD #define CUDABLAS_GETRF_ARGTYPES(Dtype) \ int n, Dtype** dA_array, int ldda, int* ipiv_array, int* info_array, int batchsize template void getrfBatched(CUDABLAS_GETRF_ARGTYPES(Dtype)) { TORCH_CHECK(false, "at::cuda::blas::getrfBatched: not implemented"); +======= +template +void getrfBatched(CUDABLAS_GETRF_ARGTYPES(Dtype)) { + static_assert(false&&sizeof(Dtype), "at::cuda::blas::getrfBatched: not implemented"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template<> TORCH_CUDA_CU_API void getrfBatched(CUDABLAS_GETRF_ARGTYPES(float)); @@ -340,6 +454,7 @@ TORCH_CUDA_CU_API void getrfBatched>(CUDABLAS_GETRF_ARGTYPE template<> TORCH_CUDA_CU_API void getrfBatched>(CUDABLAS_GETRF_ARGTYPES(c10::complex)); +<<<<<<< HEAD #define CUDABLAS_GELS_BATCHED_ARGTYPES(Dtype) \ cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, Dtype** dA_array, int ldda, Dtype** dC_array, int lddc, int* info, int *devInfoArray, int batchSize @@ -348,6 +463,12 @@ void gelsBatched(CUDABLAS_GELS_BATCHED_ARGTYPES(Dtype)) { static_assert(false&&sizeof(Dtype),"at::cuda::blas::gelsBatched: not implemented"); } +======= +template +void gelsBatched(CUDABLAS_GELS_BATCHED_ARGTYPES(Dtype)) { + static_assert(false&&sizeof(Dtype), "at::cuda::blas::gelsBatched: not implemented"); +} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template<> 
TORCH_CUDA_CU_API void gelsBatched(CUDABLAS_GELS_BATCHED_ARGTYPES(double)); template<> @@ -357,4 +478,31 @@ TORCH_CUDA_CU_API void gelsBatched>(CUDABLAS_GELS_BATCHED_A template<> TORCH_CUDA_CU_API void gelsBatched>(CUDABLAS_GELS_BATCHED_ARGTYPES(c10::complex)); +<<<<<<< HEAD +======= +#else // !(defined(USE_ROCM) && defined(_MSC_VER)) + +template +void getrsBatched(CUDABLAS_GETRS_ARGTYPES(Dtype)) { + TORCH_CHECK(false, "at::cuda::blas::getrsBatched: not supported for HIP on Windows"); +} + +template +void geqrfBatched(CUDABLAS_GEQRF_BATCHED_ARGTYPES(Dtype)) { + TORCH_CHECK(false, "at::cuda::blas::geqrfBatched: not supported for HIP on Windows"); +} + +template +void getrfBatched(CUDABLAS_GETRF_ARGTYPES(Dtype)) { + TORCH_CHECK(false, "at::cuda::blas::getrfBatched: not supported for HIP on Windows"); +} + +template +void gelsBatched(CUDABLAS_GELS_BATCHED_ARGTYPES(Dtype)) { + TORCH_CHECK(false, "at::cuda::blas::gelsBatched: not supported for HIP on Windows"); +} + +#endif // !(defined(USE_ROCM) && defined(_MSC_VER)) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::cuda::blas diff --git a/aten/src/ATen/cuda/CUDAConfig.h.in b/aten/src/ATen/cuda/CUDAConfig.h.in index 7c7f2cc7470a..1cda19296d9a 100644 --- a/aten/src/ATen/cuda/CUDAConfig.h.in +++ b/aten/src/ATen/cuda/CUDAConfig.h.in @@ -8,6 +8,10 @@ // only be included from C++ files. #define AT_CUDNN_ENABLED() @AT_CUDNN_ENABLED@ #define AT_CUSPARSELT_ENABLED() @AT_CUSPARSELT_ENABLED@ +<<<<<<< HEAD +======= +#define AT_HIPSPARSELT_ENABLED() @AT_HIPSPARSELT_ENABLED@ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define AT_ROCM_ENABLED() @AT_ROCM_ENABLED@ #define AT_MAGMA_ENABLED() @AT_MAGMA_ENABLED@ diff --git a/aten/src/ATen/cuda/CUDADataType.h b/aten/src/ATen/cuda/CUDADataType.h index b3ac2b39fcfb..4ebe9aef3f93 100644 --- a/aten/src/ATen/cuda/CUDADataType.h +++ b/aten/src/ATen/cuda/CUDADataType.h @@ -78,7 +78,11 @@ inline cudaDataType ScalarTypeToCudaDataType(const c10::ScalarType& scalar_type) return CUDA_R_64I; case c10::ScalarType::BFloat16: return CUDA_R_16BF; +<<<<<<< HEAD #if (defined(CUDA_VERSION) && CUDA_VERSION >= 11080) || (defined(USE_ROCM) && ROCM_VERSION >= 60300) +======= +#if !defined(USE_ROCM) || ROCM_VERSION >= 60300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case c10::ScalarType::Float8_e4m3fn: return CUDA_R_8F_E4M3; case c10::ScalarType::Float8_e5m2: @@ -90,6 +94,13 @@ inline cudaDataType ScalarTypeToCudaDataType(const c10::ScalarType& scalar_type) case c10::ScalarType::Float8_e5m2fnuz: return HIP_R_8F_E5M2_FNUZ; #endif +<<<<<<< HEAD +======= +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12080) || (defined(USE_ROCM) && ROCM_VERSION >= 70000) + case c10::ScalarType::Float4_e2m1fn_x2: + return CUDA_R_4F_E2M1; +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) default: TORCH_INTERNAL_ASSERT(false, "Cannot convert ScalarType ", scalar_type, " to cudaDataType.") } diff --git a/aten/src/ATen/cuda/CUDAEvent.h b/aten/src/ATen/cuda/CUDAEvent.h index 94ce34645b02..b231589676fe 100644 --- a/aten/src/ATen/cuda/CUDAEvent.h +++ b/aten/src/ATen/cuda/CUDAEvent.h @@ -13,6 +13,20 @@ #include #include +<<<<<<< HEAD +======= +/* +* `cudaEventExternal` is a torch-specific flag that is used to +* indicate 
that the CUDAEvent will be used only for synchronization +* with work outside of the cuda graph, rather than creation of +* cross-stream dependencies within a cuda graph. Resources: +* https://docs.nvidia.com/cuda/archive/12.9.0/cuda-c-programming-guide/index.html#cross-stream-dependencies-and-events +* https://docs.nvidia.com/cuda/archive/12.9.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3457b81d1d32c6a00f6132fbc2693d47 +* https://docs.nvidia.com/cuda/archive/12.9.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g0c23426b7252eaa9cef695859991304e +*/ +#define cudaEventExternal 0x08 + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::cuda { /* @@ -118,7 +132,18 @@ struct TORCH_CUDA_CPP_API CUDAEvent { TORCH_CHECK(device_index_ == stream.device_index(), "Event device ", device_index_, " does not match recording stream's device ", stream.device_index(), "."); CUDAGuard guard(device_index_); +<<<<<<< HEAD AT_CUDA_CHECK(cudaEventRecord(event_, stream)); +======= + +#ifndef USE_ROCM + // it is an error to use cudaEventRecordExternal when not doing stream capture + unsigned int flags = (c10::cuda::currentStreamCaptureStatusMayInitCtx() != c10::cuda::CaptureStatus::None && external_) ? cudaEventRecordExternal : cudaEventRecordDefault; + AT_CUDA_CHECK(cudaEventRecordWithFlags(event_, stream, flags)); +#else + AT_CUDA_CHECK(cudaEventRecord(event_, stream)); +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); if (C10_UNLIKELY(interp)) { (*interp)->trace_gpu_event_record(at::kCUDA, @@ -134,7 +159,17 @@ struct TORCH_CUDA_CPP_API CUDAEvent { void block(const CUDAStream& stream) { if (is_created_) { CUDAGuard guard(stream.device_index()); +<<<<<<< HEAD AT_CUDA_CHECK(cudaStreamWaitEvent(stream, event_, 0)); +======= +#ifndef USE_ROCM + // it is an error to use cudaEventWaitExternal when not doing stream capture + unsigned int flags = (c10::cuda::currentStreamCaptureStatusMayInitCtx() != c10::cuda::CaptureStatus::None && external_) ? 
cudaEventWaitExternal : cudaEventWaitDefault; + AT_CUDA_CHECK(cudaStreamWaitEvent(stream, event_, flags)); +#else + AT_CUDA_CHECK(cudaStreamWaitEvent(stream, event_)); +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); if (C10_UNLIKELY(interp)) { (*interp)->trace_gpu_event_wait(at::kCUDA, @@ -147,8 +182,21 @@ struct TORCH_CUDA_CPP_API CUDAEvent { // Note: cudaEventElapsedTime can be safely called from any device float elapsed_time(const CUDAEvent& other) const { +<<<<<<< HEAD TORCH_CHECK(is_created_ && other.isCreated(), "Both events must be recorded before calculating elapsed time."); +======= + TORCH_CHECK_VALUE( + !(flags_ & cudaEventDisableTiming) && !(other.flags_ & cudaEventDisableTiming), + "Both events must be created with argument 'enable_timing=True'."); + TORCH_CHECK_VALUE( + is_created_ && other.isCreated(), + "Both events must be recorded before calculating elapsed time."); + TORCH_CHECK( + query() && other.query(), + "Both events must be completed before calculating elapsed time."); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) float time_ms = 0; // We do not strictly have to set the device index to the same as our event, // but if we don't and the current device is not initialized, it will @@ -185,10 +233,22 @@ struct TORCH_CUDA_CPP_API CUDAEvent { unsigned int flags_ = cudaEventDisableTiming; bool is_created_ = false; bool was_recorded_ = false; +<<<<<<< HEAD +======= + bool external_ = false; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DeviceIndex device_index_ = -1; cudaEvent_t event_{}; void createEvent(DeviceIndex device_index) { +<<<<<<< HEAD +======= + external_ = (flags_ & cudaEventExternal) != 0; +#ifdef USE_ROCM + TORCH_CHECK(!external_, "External events are disallowed in rocm"); +#endif + flags_ &= ~cudaEventExternal; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) device_index_ = device_index; CUDAGuard guard(device_index_); AT_CUDA_CHECK(cudaEventCreateWithFlags(&event_, flags_)); diff --git a/aten/src/ATen/cuda/CUDAGraph.cpp b/aten/src/ATen/cuda/CUDAGraph.cpp index 3f2916862cac..5d56bfa5b352 100644 --- a/aten/src/ATen/cuda/CUDAGraph.cpp +++ b/aten/src/ATen/cuda/CUDAGraph.cpp @@ -38,9 +38,16 @@ MempoolId_t graph_pool_handle() { * describes memory management for captures. */ +<<<<<<< HEAD CUDAGraph::CUDAGraph() // CUDAStreams may not be default-constructed. : capture_stream_(at::cuda::getCurrentCUDAStream()) { +======= +CUDAGraph::CUDAGraph(bool keep_graph) + // CUDAStreams may not be default-constructed. 
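A hedged usage sketch for the torch-specific cudaEventExternal flag introduced above: the flag is stored in external_ and only lowers to cudaEventRecordExternal / cudaEventWaitExternal while a stream capture is in progress (CUDA builds only; the patch rejects it under ROCm). The capture plumbing below is illustrative, not part of the patch.

#include <ATen/cuda/CUDAEvent.h>
#include <ATen/cuda/CUDAGraph.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/cuda/CUDAGuard.h>

// Hypothetical: record an external event during capture so that work outside
// the graph can synchronize on it without adding an in-graph dependency.
void capture_with_external_event(at::cuda::CUDAGraph& graph,
                                 c10::cuda::CUDAStream capture_stream) {
  at::cuda::CUDAEvent ev(cudaEventDisableTiming | cudaEventExternal);
  c10::cuda::CUDAStreamGuard guard(capture_stream);  // capture on a side stream
  graph.capture_begin();
  // ... enqueue the kernels to be captured on capture_stream ...
  ev.record(capture_stream);  // becomes cudaEventRecordWithFlags(..., cudaEventRecordExternal)
  graph.capture_end();
}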
+ : capture_stream_(at::cuda::getCurrentCUDAStream()), + keep_graph_(keep_graph) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void CUDAGraph::register_generator_state( @@ -126,8 +133,42 @@ void CUDAGraph::capture_end() { c10::cuda::CUDACachingAllocator::endAllocateToPool(capture_dev_, mempool_id_); TORCH_CHECK(graph_ != nullptr, "Invalid capture."); +<<<<<<< HEAD + has_graph_ = true; + +======= + + for (auto& [generator_state, wholegraph_increments] : + captured_generator_states_) { + wholegraph_increments = generator_state->capture_epilogue(); + } + + size_t numCUDAGraphNodes = 0; + AT_CUDA_CHECK(cudaGraphGetNodes(graph_, nullptr, &numCUDAGraphNodes)); + if (numCUDAGraphNodes == 0) { + TORCH_WARN("The CUDA Graph is empty. This usually means that the graph was ", + "attempted to be captured on wrong device or stream."); + } + + capture_ended_ = true; has_graph_ = true; + if (!keep_graph_) { + instantiate(); + if (!_cuda_graphs_debug) { + AT_CUDA_CHECK(cudaGraphDestroy(graph_)); + } + has_graph_ = false; + } +} +void CUDAGraph::instantiate() { + TORCH_CHECK(capture_ended_, "capture_end() must have been called before calling instantiate"); + + if (has_graph_exec_) { + TORCH_CHECK(keep_graph_, "instantiate() is intended to be called by the user only when keep_graph=true"); + AT_CUDA_CHECK(cudaGraphExecDestroy(graph_exec_)); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // In typical graph usage some tensors (e.g. the tensors used for graph IO) are not freed // between replays. // If Pytorch compiles and runs with a CUDA 11.4+ toolkit, there's a chance the allocator backend @@ -139,7 +180,11 @@ void CUDAGraph::capture_end() { // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1g1accfe1da0c605a577c22d9751a09597 // cudaGraphInstantiateWithFlags // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ga2c652a24ba93e52b99a47bec0888233 +<<<<<<< HEAD #if ((defined(CUDA_VERSION) && CUDA_VERSION >= 11040) || (defined(USE_ROCM) && ROCM_VERSION >= 60200)) +======= +#if !defined(USE_ROCM) || ROCM_VERSION >= 60200 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int version = 0; AT_CUDA_CHECK(cudaDriverGetVersion(&version)); if (version < 11040) { @@ -154,13 +199,18 @@ void CUDAGraph::capture_end() { #endif //Since ROCm 6.2, we want to go down this path as hipGraphExecDestroy in the destructor will not immediately free the memory. //It will wait for the next sync operation. cudaGraphInstantiateFlagAutoFreeOnLaunch will add async frees after graph launch. 
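A minimal capture/replay sketch for the new keep_graph option defined above: with the default keep_graph=false, capture_end() still instantiates immediately and destroys graph_, while keep_graph=true retains graph_ and defers cudaGraphInstantiate to an explicit instantiate() or to the first replay() (see the replay() change below). The launched work is elided, and the current stream is assumed to be a capturable side stream.

#include <ATen/cuda/CUDAGraph.h>

// Hypothetical round trip with keep_graph=true.
void capture_and_replay() {
  at::cuda::CUDAGraph graph(/*keep_graph=*/true);
  graph.capture_begin();
  // ... launch the work to be captured on the current (capture) stream ...
  graph.capture_end();   // graph_ is retained; no cudaGraphInstantiate yet
  graph.instantiate();   // optional here; replay() would instantiate lazily
  graph.replay();
}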
+<<<<<<< HEAD #if ((defined(CUDA_VERSION) && CUDA_VERSION >= 11040) || (defined(USE_ROCM) && ROCM_VERSION >= 60200)) +======= +#if !defined(USE_ROCM) || ROCM_VERSION >= 60200 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { AT_CUDA_CHECK(cudaGraphInstantiateWithFlags(&graph_exec_, graph_, cudaGraphInstantiateFlagAutoFreeOnLaunch)); } #endif +<<<<<<< HEAD has_graph_exec_ = true; @@ -191,6 +241,20 @@ void CUDAGraph::replay() { TORCH_CHECK(has_graph_exec_, "Called CUDAGraph::replay without a preceding successful capture."); +======= + has_graph_exec_ = true; +} + +void CUDAGraph::replay() { + TORCH_CHECK(capture_ended_, + "Called CUDAGraph::replay without a preceding successful capture."); + + if (!has_graph_exec_) { + TORCH_INTERNAL_ASSERT(keep_graph_); + instantiate(); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::OptionalDeviceGuard device_guard{capture_stream_.device()}; for (auto& [generator_state, wholegraph_increments] : @@ -216,14 +280,26 @@ void CUDAGraph::enable_debug_mode() { } void CUDAGraph::debug_dump(const std::string& debug_path) { +<<<<<<< HEAD #if (defined(CUDA_VERSION) && CUDA_VERSION >= 11030)|| defined(USE_ROCM) if (_cuda_graphs_debug) { +======= +#if defined(CUDA_VERSION) || defined(USE_ROCM) + if (_cuda_graphs_debug || keep_graph_) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN("DEBUG: calling debug_dump()"); if (has_graph_) { TORCH_WARN("DEBUG: calling cudaGraphDebugDotPrint() with ", debug_path); C10_CUDA_CHECK_WARN(cudaGraphDebugDotPrint(graph_, debug_path.c_str(), cudaGraphDebugDotFlagsVerbose)); // most verbose output +<<<<<<< HEAD AT_CUDA_CHECK(cudaGraphDestroy(graph_)); has_graph_ = false; +======= + if (!keep_graph_) { + AT_CUDA_CHECK(cudaGraphDestroy(graph_)); + has_graph_ = false; + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } else { TORCH_WARN("CUDA Graphs debug not enabled, set with [graph].enable_debug_mode()"); @@ -233,6 +309,15 @@ void CUDAGraph::debug_dump(const std::string& debug_path) { #endif } +<<<<<<< HEAD +======= +cudaGraph_t CUDAGraph::raw_cuda_graph() { + TORCH_CHECK(keep_graph_, "You cannot access the raw cudaGraph_t instance unless CUDAGraph was initialized with keep_graph=true"); + TORCH_CHECK(has_graph_, "You cannot access the raw cudaGraph_t instance until capture_end() has been called"); + return graph_; +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void CUDAGraph::reset() { // I'd prefer these checks throw exceptions, not print warnings, // but the destructor calls reset(), and at least one CI build @@ -253,9 +338,16 @@ void CUDAGraph::reset() { // and the allocator could end up in all kinds of weird states depending where failure occurred. // If the user catches the failure exception in a script, or is running in REPL or (god forbid) // a Jupyter notebook, I don't see an easy way for reset() to gracefully fix all such possible error states. +<<<<<<< HEAD if (has_graph_ || has_graph_exec_) { // notifyCaptureDestroy may throw. How should we handle this? c10::cuda::CUDACachingAllocator::releasePool(capture_dev_, mempool_id_); +======= + if (capture_ended_) { + // notifyCaptureDestroy may throw. 
How should we handle this? + c10::cuda::CUDACachingAllocator::releasePool(capture_dev_, mempool_id_); + capture_ended_ = false; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } if (has_graph_) { C10_CUDA_CHECK_WARN(cudaGraphDestroy(graph_)); @@ -269,7 +361,11 @@ void CUDAGraph::reset() { // Returns an id another graph's capture_begin can use to share the same memory pool as this graph. MempoolId_t CUDAGraph::pool() { +<<<<<<< HEAD TORCH_CHECK(has_graph_exec_, +======= +TORCH_CHECK(capture_ended_, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "Called CUDAGraph::pool() without a preceding successful capture."); return mempool_id_; } diff --git a/aten/src/ATen/cuda/CUDAGraph.h b/aten/src/ATen/cuda/CUDAGraph.h index 76a090579d1d..b492b3152797 100644 --- a/aten/src/ATen/cuda/CUDAGraph.h +++ b/aten/src/ATen/cuda/CUDAGraph.h @@ -19,7 +19,11 @@ namespace cuda { TORCH_CUDA_CPP_API MempoolId_t graph_pool_handle(); struct TORCH_CUDA_CPP_API CUDAGraph { +<<<<<<< HEAD CUDAGraph(); +======= + CUDAGraph(bool keep_graph=false); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ~CUDAGraph(); // See Note [Explicit Registration of Generators to the CUDA Graph] @@ -29,21 +33,39 @@ struct TORCH_CUDA_CPP_API CUDAGraph { MempoolId_t pool = {0, 0}, cudaStreamCaptureMode capture_mode = cudaStreamCaptureModeGlobal); void capture_end(); +<<<<<<< HEAD +======= + void instantiate(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void replay(); void reset(); MempoolId_t pool(); void enable_debug_mode(); void debug_dump(const std::string& debug_path); +<<<<<<< HEAD +======= + cudaGraph_t raw_cuda_graph(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) protected: cudaGraph_t graph_ = nullptr; cudaGraphExec_t graph_exec_ = nullptr; // internal states so reset() can do its best cleaning up +<<<<<<< HEAD // Set to true in capture_end if cudaStreamEndCapture succeeded // Set back to false soon after, when graph_ is consumed by cudaGraphInstantiate // to create graph_exec_, then graph_ is deleted bool has_graph_ = false; +======= + + // Set to true in capture_end if cudaStreamEndCapture succeeded + // Set back to false after instantiate() unless keep_graph=True or + // enable_debug_mode() was called on any CUDAGraph instance. 
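The raw_cuda_graph() accessor declared above is only valid when the graph was constructed with keep_graph=true and only after capture_end(); a hedged sketch of inspecting the captured-but-not-yet-instantiated graph follows (cudaGraphGetNodes is standard CUDA runtime API, not part of this patch).

#include <cuda_runtime_api.h>
#include <ATen/cuda/CUDAGraph.h>
#include <ATen/cuda/Exceptions.h>

// Hypothetical: count the nodes of a captured graph before instantiation.
size_t captured_node_count(at::cuda::CUDAGraph& graph) {
  cudaGraph_t raw = graph.raw_cuda_graph();  // requires keep_graph=true
  size_t num_nodes = 0;
  AT_CUDA_CHECK(cudaGraphGetNodes(raw, /*nodes=*/nullptr, &num_nodes));
  return num_nodes;
}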
+ bool has_graph_ = false; + // Set to true in capture_end if cudaStreamEndCapture succeeded + bool capture_ended_ = false; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Set to true in capture_end if cudaGraphInstantiate succeeded bool has_graph_exec_ = false; @@ -80,6 +102,11 @@ struct TORCH_CUDA_CPP_API CUDAGraph { // init capture_dev_ as UNDEFINED_DEVICE to check that it stores the real device id in the destructor static constexpr c10::DeviceIndex UNDEFINED_DEVICE = -1; c10::DeviceIndex capture_dev_{UNDEFINED_DEVICE}; +<<<<<<< HEAD +======= + + bool keep_graph_; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; } // namespace cuda diff --git a/aten/src/ATen/cuda/CUDASparseDescriptors.cpp b/aten/src/ATen/cuda/CUDASparseDescriptors.cpp index 84711be2ddf3..0df75f3b39c7 100644 --- a/aten/src/ATen/cuda/CUDASparseDescriptors.cpp +++ b/aten/src/ATen/cuda/CUDASparseDescriptors.cpp @@ -75,12 +75,16 @@ cusparseDnMatDescr_t createRawDnMatDescriptor(const Tensor& input, int64_t batch auto leading_dimension = is_row_major ? input_strides[ndim - 2] : input_strides[ndim - 1]; +<<<<<<< HEAD #if !defined(USE_ROCM) auto order = is_row_major ? CUSPARSE_ORDER_ROW : CUSPARSE_ORDER_COL; #else TORCH_INTERNAL_ASSERT(is_column_major, "Expected column major input."); auto order = CUSPARSE_ORDER_COL; #endif +======= + auto order = is_row_major ? CUSPARSE_ORDER_ROW : CUSPARSE_ORDER_COL; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto batch_stride = ndim > 2 && batch_offset >= 0 ? input_strides[ndim - 3] : 0; // NOLINTNEXTLINE(*const-cast) diff --git a/aten/src/ATen/cuda/CachingHostAllocator.cpp b/aten/src/ATen/cuda/CachingHostAllocator.cpp index 8a039ea3bff9..47e948a9acd4 100644 --- a/aten/src/ATen/cuda/CachingHostAllocator.cpp +++ b/aten/src/ATen/cuda/CachingHostAllocator.cpp @@ -9,6 +9,10 @@ #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::cuda { namespace { @@ -71,6 +75,11 @@ using Block = HostBlock; struct CUDACachingHostAllocatorImpl : public CachingHostAllocatorImpl { private: +<<<<<<< HEAD +======= + std::unordered_map use_host_register; + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void allocate_host_memory(size_t size, void** ptr) override { // Pinned memory pointers allocated by any device can be directly used by // any other device, regardless of the current device at the time of @@ -88,41 +97,82 @@ struct CUDACachingHostAllocatorImpl at::Device(at::DeviceType::CUDA, *primary_ctx_device_index)); } +<<<<<<< HEAD auto start = std::chrono::system_clock::now(); if (c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig:: pinned_use_cuda_host_register()) { +======= + auto start = std::chrono::steady_clock::now(); + bool use_register = c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_use_cuda_host_register(); + if (use_register) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) allocWithCudaHostRegister(ptr, size); } else { // Use cudaHostAlloc for allocating pinned memory (global lock in driver) C10_CUDA_CHECK(cudaHostAlloc(ptr, size, 
cudaHostAllocDefault)); } +<<<<<<< HEAD auto end = std::chrono::system_clock::now(); +======= + + auto end = std::chrono::steady_clock::now(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto duration = std::chrono::duration_cast(end - start); // Update the statistics on the time spent on cudaHostAlloc/hostRegister { std::lock_guard g(stats_.timing_mutex_); +<<<<<<< HEAD +======= + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(use_host_register.count(*ptr) == 0); + use_host_register[*ptr] = use_register; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) stats_.host_alloc_time.increase(duration.count()); } } void free_block(Block* block) override { +<<<<<<< HEAD auto start = std::chrono::system_clock::now(); if (c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig:: pinned_use_cuda_host_register()) { void* ptr = block->ptr_; +======= + auto start = std::chrono::steady_clock::now(); + // Users may change the allocator config at will. torch unit tests do this. + // However, allocations using cudaHostRegister should use corresonding + // cudaHostUnregister and similarly for cudaHostAlloc / cudaFreeHost. + void* ptr = block->ptr_; + bool use_register = false; + { + std::lock_guard g(stats_.timing_mutex_); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(use_host_register.count(ptr) == 1); + use_register = use_host_register[ptr]; + } + if (use_register) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_CUDA_CHECK(cudaHostUnregister(ptr)); // NOLINTNEXTLINE(cppcoreguidelines-no-malloc) std::free(ptr); } else { +<<<<<<< HEAD AT_CUDA_CHECK(cudaFreeHost(block->ptr_)); } auto end = std::chrono::system_clock::now(); +======= + AT_CUDA_CHECK(cudaFreeHost(ptr)); + } + auto end = std::chrono::steady_clock::now(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto duration = std::chrono::duration_cast(end - start); // Update the statistics on the time spent on cudaFreeHost/hostUnregister { std::lock_guard g(stats_.timing_mutex_); +<<<<<<< HEAD +======= + use_host_register.erase(ptr); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) stats_.host_free_time.increase(duration.count()); } } @@ -185,6 +235,7 @@ struct CUDACachingHostAllocatorImpl } } +<<<<<<< HEAD void registerPages(const void* ptr, size_t size) { AT_CUDA_CHECK( cudaHostRegister((void*)ptr, (size_t)size, cudaHostRegisterDefault)); @@ -200,6 +251,8 @@ struct CUDACachingHostAllocatorImpl ""); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void allocWithCudaHostRegister(void** ptr, size_t roundSize) { // Here we do regular allocation, pre-fault/map the pages, and then do // cudaHostRegister with GPU mapping flags to lock the pages, so we @@ -249,6 +302,7 @@ struct CUDACachingHostAllocatorImpl } // Register the mapped pages using cudaHostRegister +<<<<<<< HEAD registerPages(*ptr, roundSize); } }; @@ -307,4 +361,20 @@ void CachingHostAllocator_resetPeakStats() { return getCUDACachingHostAllocator().resetPeakStats(); } +======= + AT_CUDA_CHECK( + cudaHostRegister(*ptr, roundSize, cudaHostRegisterDefault)); + } +}; + +DECLARE_HOST_ALLOCATOR( + CUDACachingHostAllocator, + 
CUDACachingHostAllocatorImpl, + raw_local_deleter, + caching_host_allocator); + +REGISTER_HOST_ALLOCATOR(at::kCUDA, &caching_host_allocator) + +} // anonymous namespace +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::cuda diff --git a/aten/src/ATen/cuda/CachingHostAllocator.h b/aten/src/ATen/cuda/CachingHostAllocator.h index 6c33dfaeb534..826c090b04ed 100644 --- a/aten/src/ATen/cuda/CachingHostAllocator.h +++ b/aten/src/ATen/cuda/CachingHostAllocator.h @@ -3,6 +3,10 @@ #include #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::cuda { @@ -18,6 +22,7 @@ namespace at::cuda { // call between host and device, and passed the corresponding context from the // allocation. This is currently invoked by at::native::copy_kernel_cuda. // +<<<<<<< HEAD TORCH_CUDA_CPP_API c10::Allocator* getCachingHostAllocator(); // Records an event in the specified stream. The allocation corresponding to the @@ -38,5 +43,54 @@ TORCH_CUDA_CPP_API at::HostStats CachingHostAllocator_getStats(); TORCH_CUDA_CPP_API void CachingHostAllocator_resetAccumulatedStats(); TORCH_CUDA_CPP_API void CachingHostAllocator_resetPeakStats(); +======= +C10_DEPRECATED_MESSAGE( + "at::cuda::getCachingHostAllocator() is deprecated. Please use at::getHostAllocator(at::kCUDA) instead.") +inline TORCH_CUDA_CPP_API at::HostAllocator* getCachingHostAllocator() { + return at::getHostAllocator(at::kCUDA); +} + +// Records an event in the specified stream. The allocation corresponding to the +// input `ptr`/`ctx` will not be re-used until the event has occurred. +C10_DEPRECATED_MESSAGE( + "at::cuda::CachingHostAllocator_recordEvent(...) is deprecated. Please use at::getHostAllocator(at::kCUDA)->record_event(...) instead.") +inline TORCH_CUDA_CPP_API bool CachingHostAllocator_recordEvent( + void* ptr, + void* ctx, + c10::cuda::CUDAStream stream) { + return getHostAllocator(at::kCUDA)->record_event(ptr, ctx, stream.unwrap()); +} + +// Releases cached pinned memory allocations via cudaHostFree +C10_DEPRECATED_MESSAGE( + "at::cuda::CachingHostAllocator_emptyCache() is deprecated. Please use at::getHostAllocator(at::kCUDA)->empty_cache() instead.") +inline TORCH_CUDA_CPP_API void CachingHostAllocator_emptyCache() { + getHostAllocator(at::kCUDA)->empty_cache(); +} + +C10_DEPRECATED_MESSAGE( + "at::cuda::HostAlloc(...) is deprecated. Please use at::getHostAllocator(at::kCUDA)->allocate(...) instead.") +inline TORCH_CUDA_CPP_API at::DataPtr HostAlloc(size_t size) { + return getHostAllocator(at::kCUDA)->allocate(size); +} + +C10_DEPRECATED_MESSAGE( + "at::cuda::CachingHostAllocator_getStats() is deprecated. Please use at::getHostAllocator(at::kCUDA)->get_stats() instead.") +inline TORCH_CUDA_CPP_API at::HostStats CachingHostAllocator_getStats() { + return getHostAllocator(at::kCUDA)->get_stats(); +} + +C10_DEPRECATED_MESSAGE( + "at::cuda::CachingHostAllocator_resetAccumulatedStats() is deprecated. Please use at::getHostAllocator(at::kCUDA)->reset_accumulated_stats() instead.") +inline TORCH_CUDA_CPP_API void CachingHostAllocator_resetAccumulatedStats() { + getHostAllocator(at::kCUDA)->reset_accumulated_stats(); +} + +C10_DEPRECATED_MESSAGE( + "at::cuda::CachingHostAllocator_resetPeakStats() is deprecated. 
+inline TORCH_CUDA_CPP_API void CachingHostAllocator_resetPeakStats() {
+  getHostAllocator(at::kCUDA)->reset_peak_stats();
+}
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))

 } // namespace at::cuda
diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp
index 2892a286ed62..9053902b3ba0 100644
--- a/aten/src/ATen/cuda/CublasHandlePool.cpp
+++ b/aten/src/ATen/cuda/CublasHandlePool.cpp
@@ -123,6 +123,7 @@ void clearCublasWorkspaces() {
 }

 size_t parseChosenWorkspaceSize() {
+<<<<<<< HEAD
   const char * val = getenv("CUBLAS_WORKSPACE_CONFIG");
 #ifdef USE_ROCM
   if (!val) {
@@ -137,6 +138,20 @@ size_t parseChosenWorkspaceSize() {
     std::string device_arch = properties->gcnArchName;
     const bool gfx94 = device_arch.find("gfx94") != std::string::npos;
     const size_t default_size = gfx94 ? 1024 * 128 * 1024 : 1024 * 32 * 1024;
+=======
+  auto val = c10::utils::get_env("CUBLAS_WORKSPACE_CONFIG");
+#ifdef USE_ROCM
+  if (!val) {
+    val = c10::utils::get_env("HIPBLAS_WORKSPACE_CONFIG");
+  }
+  if (!val) {
+    // for extra convenience
+    val = c10::utils::get_env("ROCBLAS_WORKSPACE_CONFIG");
+  }
+  /* 32MiB default, 128MiB for gfx94x/gfx95x */
+  const bool gfx94_95 = at::detail::getCUDAHooks().isGPUArch({"gfx94", "gfx95"});
+  const size_t default_size = gfx94_95 ? 1024 * 128 * 1024 : 1024 * 32 * 1024;
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 #else
   /* :4096:2:16:8 default, 32MiB for Hopper */
   cudaDeviceProp* properties = at::cuda::getCurrentDeviceProperties();
@@ -146,7 +161,11 @@ size_t parseChosenWorkspaceSize() {

   if (val) {
     size_t total_size = 0;
+<<<<<<< HEAD
     const std::string config(val);
+=======
+    const std::string& config(val.value());
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
     std::regex exp(":([0-9]+):([0-9]+)");
     std::sregex_iterator next(config.begin(), config.end(), exp);
     std::sregex_iterator end;
diff --git a/aten/src/ATen/cuda/Exceptions.h b/aten/src/ATen/cuda/Exceptions.h
index 7a24151df205..029cd7338dc0 100644
--- a/aten/src/ATen/cuda/Exceptions.h
+++ b/aten/src/ATen/cuda/Exceptions.h
@@ -117,15 +117,22 @@ constexpr const char* _cusolver_backend_suggestion = \
     "linear algebra operators with other supported backends. " \
     "See https://pytorch.org/docs/stable/backends.html#torch.backends.cuda.preferred_linalg_library";

+<<<<<<< HEAD
 // When cuda < 11.5, cusolver raises CUSOLVER_STATUS_EXECUTION_FAILED when input contains nan.
+=======
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 // When cuda >= 11.5, cusolver normally finishes execution and sets info array indicating convergence issue.
#define TORCH_CUSOLVER_CHECK(EXPR) \ do { \ cusolverStatus_t __err = EXPR; \ +<<<<<<< HEAD if ((CUDA_VERSION < 11500 && \ __err == CUSOLVER_STATUS_EXECUTION_FAILED) || \ (CUDA_VERSION >= 11500 && \ __err == CUSOLVER_STATUS_INVALID_VALUE)) { \ +======= + if (__err == CUSOLVER_STATUS_INVALID_VALUE) { \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK_LINALG( \ false, \ "cusolver error: ", \ diff --git a/aten/src/ATen/cuda/PinnedMemoryAllocator.h b/aten/src/ATen/cuda/PinnedMemoryAllocator.h index 854f5d8dd129..bbe3687ba24c 100644 --- a/aten/src/ATen/cuda/PinnedMemoryAllocator.h +++ b/aten/src/ATen/cuda/PinnedMemoryAllocator.h @@ -1,11 +1,19 @@ #pragma once +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include namespace at::cuda { +<<<<<<< HEAD inline TORCH_CUDA_CPP_API at::Allocator* getPinnedMemoryAllocator() { return getCachingHostAllocator(); +======= +inline TORCH_CUDA_CPP_API at::HostAllocator* getPinnedMemoryAllocator() { + return at::getHostAllocator(at::kCUDA); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } // namespace at::cuda diff --git a/aten/src/ATen/cuda/cub-RadixSortPairs-f16-8.cu b/aten/src/ATen/cuda/cub-RadixSortPairs-f16-8.cu new file mode 100644 index 000000000000..6c20daed2e02 --- /dev/null +++ b/aten/src/ATen/cuda/cub-RadixSortPairs-f16-8.cu @@ -0,0 +1,7 @@ +#include + +namespace at::cuda::cub::detail { + +AT_INSTANTIATE_SORT_PAIRS(c10::BFloat16, 8) + +} // namespace at::cuda::cub::detail diff --git a/aten/src/ATen/cuda/cub-RadixSortPairs-int32-1.cu b/aten/src/ATen/cuda/cub-RadixSortPairs-int32-1.cu new file mode 100644 index 000000000000..2adb6a519882 --- /dev/null +++ b/aten/src/ATen/cuda/cub-RadixSortPairs-int32-1.cu @@ -0,0 +1,7 @@ +#include + +namespace at::cuda::cub::detail { + +AT_INSTANTIATE_SORT_PAIRS(int32_t, 1) + +} // namespace at::cuda::cub::detail diff --git a/aten/src/ATen/cuda/cub-RadixSortPairs-int32-2.cu b/aten/src/ATen/cuda/cub-RadixSortPairs-int32-2.cu new file mode 100644 index 000000000000..39e29b7668c9 --- /dev/null +++ b/aten/src/ATen/cuda/cub-RadixSortPairs-int32-2.cu @@ -0,0 +1,7 @@ +#include + +namespace at::cuda::cub::detail { + +AT_INSTANTIATE_SORT_PAIRS(int32_t, 2) + +} // namespace at::cuda::cub::detail diff --git a/aten/src/ATen/cuda/cub-RadixSortPairs-int32-4.cu b/aten/src/ATen/cuda/cub-RadixSortPairs-int32-4.cu new file mode 100644 index 000000000000..3ad1ebd2a56a --- /dev/null +++ b/aten/src/ATen/cuda/cub-RadixSortPairs-int32-4.cu @@ -0,0 +1,7 @@ +#include + +namespace at::cuda::cub::detail { + +AT_INSTANTIATE_SORT_PAIRS(int32_t, 4) + +} // namespace at::cuda::cub::detail diff --git a/aten/src/ATen/cuda/cub-RadixSortPairs-int64-1.cu b/aten/src/ATen/cuda/cub-RadixSortPairs-int64-1.cu new file mode 100644 index 000000000000..098615b68345 --- /dev/null +++ b/aten/src/ATen/cuda/cub-RadixSortPairs-int64-1.cu @@ -0,0 +1,7 @@ +#include + +namespace at::cuda::cub::detail { + +AT_INSTANTIATE_SORT_PAIRS(int64_t, 1) + +} // namespace at::cuda::cub::detail diff --git a/aten/src/ATen/cuda/cub-RadixSortPairs-int64-2.cu b/aten/src/ATen/cuda/cub-RadixSortPairs-int64-2.cu new file mode 100644 index 000000000000..d58e0c8d5ce7 --- /dev/null +++ b/aten/src/ATen/cuda/cub-RadixSortPairs-int64-2.cu @@ -0,0 +1,7 @@ +#include + +namespace at::cuda::cub::detail { + 
+AT_INSTANTIATE_SORT_PAIRS(int64_t, 2) + +} // namespace at::cuda::cub::detail diff --git a/aten/src/ATen/cuda/cub-RadixSortPairs-int64-4.cu b/aten/src/ATen/cuda/cub-RadixSortPairs-int64-4.cu new file mode 100644 index 000000000000..fe24f72151fb --- /dev/null +++ b/aten/src/ATen/cuda/cub-RadixSortPairs-int64-4.cu @@ -0,0 +1,7 @@ +#include + +namespace at::cuda::cub::detail { + +AT_INSTANTIATE_SORT_PAIRS(int64_t, 4) + +} // namespace at::cuda::cub::detail diff --git a/aten/src/ATen/cuda/cub-RadixSortPairs-scalars.cu b/aten/src/ATen/cuda/cub-RadixSortPairs-scalars.cu new file mode 100644 index 000000000000..1373668316c2 --- /dev/null +++ b/aten/src/ATen/cuda/cub-RadixSortPairs-scalars.cu @@ -0,0 +1,7 @@ +#include + +namespace at::cuda::cub::detail { + +AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, AT_INSTANTIATE_SORT_PAIRS_8) + +} // namespace at::cuda::cub::detail diff --git a/aten/src/ATen/cuda/cub-RadixSortPairs-uint16-8.cu b/aten/src/ATen/cuda/cub-RadixSortPairs-uint16-8.cu new file mode 100644 index 000000000000..f52f97fe588a --- /dev/null +++ b/aten/src/ATen/cuda/cub-RadixSortPairs-uint16-8.cu @@ -0,0 +1,7 @@ +#include + +namespace at::cuda::cub::detail { + +AT_INSTANTIATE_SORT_PAIRS(uint16_t, 8) + +} // namespace at::cuda::cub::detail diff --git a/aten/src/ATen/cuda/cub-RadixSortPairs-uint32-8.cu b/aten/src/ATen/cuda/cub-RadixSortPairs-uint32-8.cu new file mode 100644 index 000000000000..db28bb602acc --- /dev/null +++ b/aten/src/ATen/cuda/cub-RadixSortPairs-uint32-8.cu @@ -0,0 +1,7 @@ +#include + +namespace at::cuda::cub::detail { + +AT_INSTANTIATE_SORT_PAIRS(uint32_t, 8) + +} // namespace at::cuda::cub::detail diff --git a/aten/src/ATen/cuda/cub-RadixSortPairs-uint64-8.cu b/aten/src/ATen/cuda/cub-RadixSortPairs-uint64-8.cu new file mode 100644 index 000000000000..7ad51b90b834 --- /dev/null +++ b/aten/src/ATen/cuda/cub-RadixSortPairs-uint64-8.cu @@ -0,0 +1,7 @@ +#include + +namespace at::cuda::cub::detail { + +AT_INSTANTIATE_SORT_PAIRS(uint64_t, 8) + +} // namespace at::cuda::cub::detail diff --git a/aten/src/ATen/cuda/cub-RadixSortPairs.cuh b/aten/src/ATen/cuda/cub-RadixSortPairs.cuh new file mode 100644 index 000000000000..bd40deb4125b --- /dev/null +++ b/aten/src/ATen/cuda/cub-RadixSortPairs.cuh @@ -0,0 +1,74 @@ +#pragma once + +#define TORCH_ASSERT_NO_OPERATORS +#include +#include + +namespace at::cuda::cub::detail { + +template +void radix_sort_pairs_impl( + const key_t* keys_in, + key_t* keys_out, + const OpaqueType* values_in, + OpaqueType* values_out, + int64_t n, + bool descending, + int64_t begin_bit, + int64_t end_bit) { + TORCH_CHECK( + n <= std::numeric_limits::max(), + "cub sort does not support sorting more than INT_MAX elements"); + using key_t_ = typename detail::cuda_type::type; + + auto allocator = c10::cuda::CUDACachingAllocator::get(); + c10::DataPtr keys_out_owner; + + if (keys_out == nullptr) { + keys_out_owner = allocator->allocate(n * sizeof(key_t)); + keys_out = reinterpret_cast(keys_out_owner.get()); + } + + const key_t_* keys_in_ = reinterpret_cast(keys_in); + key_t_* keys_out_ = reinterpret_cast(keys_out); + + if (descending) { + CUB_WRAPPER( + NO_ROCM(at_cuda_detail)::cub::DeviceRadixSort::SortPairsDescending, + keys_in_, + keys_out_, + values_in, + values_out, + n, + begin_bit, + end_bit, + c10::cuda::getCurrentCUDAStream()); + } else { + CUB_WRAPPER( + NO_ROCM(at_cuda_detail)::cub::DeviceRadixSort::SortPairs, + keys_in_, + keys_out_, + values_in, + values_out, + n, + begin_bit, + end_bit, + c10::cuda::getCurrentCUDAStream()); + } +} + +#define 
AT_INSTANTIATE_SORT_PAIRS(key_t, value_size) \ + template void radix_sort_pairs_impl( \ + const key_t* keys_in, \ + key_t* keys_out, \ + const OpaqueType* values_in, \ + OpaqueType* values_out, \ + int64_t n, \ + bool descending, \ + int64_t begin_bit, \ + int64_t end_bit); + +#define AT_INSTANTIATE_SORT_PAIRS_8(scalar_t, ScalarType) \ + AT_INSTANTIATE_SORT_PAIRS(scalar_t, 8) + +} // namespace at::cuda::cub::detail diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh index a1a7ab70630b..96df8d77554b 100644 --- a/aten/src/ATen/cuda/cub.cuh +++ b/aten/src/ATen/cuda/cub.cuh @@ -37,11 +37,18 @@ // handle the temporary storage and 'twice' calls for cub API #define CUB_WRAPPER(func, ...) do { \ size_t temp_storage_bytes = 0; \ +<<<<<<< HEAD func(nullptr, temp_storage_bytes, __VA_ARGS__); \ auto& caching_allocator = *::c10::cuda::CUDACachingAllocator::get(); \ auto temp_storage = caching_allocator.allocate(temp_storage_bytes); \ func(temp_storage.get(), temp_storage_bytes, __VA_ARGS__); \ AT_CUDA_CHECK(cudaGetLastError()); \ +======= + AT_CUDA_CHECK(func(nullptr, temp_storage_bytes, __VA_ARGS__)); \ + auto& caching_allocator = *::c10::cuda::CUDACachingAllocator::get(); \ + auto temp_storage = caching_allocator.allocate(temp_storage_bytes); \ + AT_CUDA_CHECK(func(temp_storage.get(), temp_storage_bytes, __VA_ARGS__));\ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } while (false) #ifdef USE_ROCM @@ -292,7 +299,11 @@ inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT #endif } +<<<<<<< HEAD # if (defined(CUDA_VERSION) && CUDA_VERSION > 11040) || defined(USE_ROCM) +======= +# if defined(CUDA_VERSION) || defined(USE_ROCM) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template struct BlockPrefixCallbackOp diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 21484c0dea9a..4645259cde4c 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -19,6 +19,13 @@ #include #include +<<<<<<< HEAD +======= +#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) +#include +#endif + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if AT_CUDNN_ENABLED() #include #endif @@ -91,7 +98,31 @@ void CUDAHooks::init() const { // Sets the CUDA_MODULE_LOADING environment variable // if it's not set by the user. +<<<<<<< HEAD c10::utils::set_env("CUDA_MODULE_LOADING", "LAZY", false); +======= + // CUDA_MODULE_LOADING="LAZY" is default for all drivers released for CUDA 12.2+. + // Check the driver version and only set the env variable if needed. 
+ bool set_lazy_module_loading = true; + #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) + auto driver_api = c10::cuda::DriverAPI::get(); + // Initialize NVML + if (driver_api->nvmlInit_v2_() == NVML_SUCCESS) { + // Get the driver version + int version = -1; + auto res = driver_api->nvmlSystemGetCudaDriverVersion_v2_(&version); + if (res == NVML_SUCCESS) { + // Check if driver is sufficiently new + if (version >= 12020) { + set_lazy_module_loading = false; + } + } + } + #endif + if (set_lazy_module_loading) { + c10::utils::set_env("CUDA_MODULE_LOADING", "LAZY", false); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const auto num_devices = c10::cuda::device_count_ensure_non_zero(); c10::cuda::CUDACachingAllocator::init(num_devices); at::cuda::detail::init_p2p_access_cache(num_devices); @@ -458,8 +489,19 @@ DeviceIndex CUDAHooks::getCurrentDevice() const { } #ifdef USE_ROCM +<<<<<<< HEAD bool CUDAHooks::isGPUArch(DeviceIndex device_index, const std::vector& archs) const { hipDeviceProp_t* prop = at::cuda::getDeviceProperties(device_index); +======= +bool CUDAHooks::isGPUArch(const std::vector& archs, DeviceIndex device_index) const { + hipDeviceProp_t* prop; + if (device_index == -1){ + prop = at::cuda::getCurrentDeviceProperties(); + } else { + prop = at::cuda::getDeviceProperties(device_index); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::string device_arch = prop->gcnArchName; for (std::string arch : archs) { size_t substring = device_arch.find(arch); diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index 34f2adee5140..84837dd871d9 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -58,7 +58,11 @@ struct CUDAHooks : public at::CUDAHooksInterface { DeviceIndex getCurrentDevice() const override; #ifdef USE_ROCM +<<<<<<< HEAD bool isGPUArch(DeviceIndex device_index, const std::vector& archs) const override; +======= + bool isGPUArch(const std::vector& archs, DeviceIndex device_index = -1) const override; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif void deviceSynchronize(DeviceIndex device_index) const override; }; diff --git a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp index c9cabeb9399f..92920db451dd 100644 --- a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp +++ b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp @@ -146,17 +146,30 @@ nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, NVRTC_STUB1(nvrtcDestroyProgram, nvrtcProgram *) NVRTC_STUB2(nvrtcGetPTXSize, nvrtcProgram, size_t *) NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *) +<<<<<<< HEAD #if defined(CUDA_VERSION) && CUDA_VERSION >= 11010 NVRTC_STUB2(nvrtcGetCUBINSize, nvrtcProgram, size_t *) NVRTC_STUB2(nvrtcGetCUBIN, nvrtcProgram, char *) #endif +======= +NVRTC_STUB2(nvrtcGetCUBINSize, nvrtcProgram, size_t *) +NVRTC_STUB2(nvrtcGetCUBIN, nvrtcProgram, char *) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char * const *) _STUB_1(NVRTC, nvrtcGetErrorString, const char *, nvrtcResult) NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*) NVRTC_STUB2(nvrtcGetProgramLog, nvrtcProgram, char *) 
NVRTC_STUB3(nvrtcGetLoweredName, nvrtcProgram, const char *, const char **) +<<<<<<< HEAD CUDA_STUB2(cuModuleLoadData, CUmodule *, const void *) +======= +CUDA_STUB2(cuModuleLoad, CUmodule*, const char*) +CUDA_STUB2(cuModuleLoadData, CUmodule *, const void *) +CUDA_STUB2(cuFuncSetCacheConfig, CUfunction, CUfunc_cache_enum) +CUDA_STUB3(cuDeviceGetAttribute, int*, CUdevice_attribute_enum, CUdevice) +CUDA_STUB2(cuDeviceGet, CUdevice*, int) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CUDA_STUB3(cuModuleGetFunction, CUfunction *, CUmodule, const char *) CUDA_STUB4(cuOccupancyMaxActiveBlocksPerMultiprocessor, int *, CUfunction, int, size_t) CUDA_STUB2(cuGetErrorString, CUresult, const char **) @@ -169,6 +182,11 @@ CUDA_STUB4(cuLinkCreate, unsigned int, CUjit_option *, void **, CUlinkState *) CUDA_STUB3(cuLinkComplete, CUlinkState, void **, size_t *) CUDA_STUB3(cuFuncSetAttribute, CUfunction, CUfunction_attribute, int) CUDA_STUB3(cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction) +<<<<<<< HEAD +======= +CUDA_STUB3(cuPointerGetAttribute, void*, CUpointer_attribute, CUdeviceptr) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if defined(CUDA_VERSION) && CUDA_VERSION >= 12000 CUresult CUDAAPI diff --git a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h index f2e2a0cef55a..689edd8a16eb 100644 --- a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h +++ b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h @@ -43,6 +43,10 @@ namespace at::cuda { _(nvrtcGetProgramLogSize) \ _(nvrtcGetProgramLog) \ _(nvrtcGetLoweredName) \ +<<<<<<< HEAD +======= + _(cuModuleLoad) \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _(cuModuleLoadData) \ _(cuModuleLoadDataEx) \ _(cuModuleGetFunction) \ @@ -60,6 +64,14 @@ namespace at::cuda { _(cuLinkComplete) \ _(cuFuncSetAttribute) \ _(cuFuncGetAttribute) \ +<<<<<<< HEAD +======= + _(cuPointerGetAttribute) \ + _(cuFuncSetCacheConfig) \ + _(cuDeviceGetAttribute) \ + _(cuDeviceGet) \ + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if defined(CUDA_VERSION) && CUDA_VERSION >= 12000 #define AT_FORALL_NVRTC_EXTENDED(_) \ @@ -70,7 +82,11 @@ namespace at::cuda { AT_FORALL_NVRTC_BASE(_) #endif +<<<<<<< HEAD #if defined(CUDA_VERSION) && CUDA_VERSION >= 11010 +======= +#if defined(CUDA_VERSION) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define AT_FORALL_NVRTC(_) \ AT_FORALL_NVRTC_EXTENDED(_) \ _(nvrtcGetCUBINSize) \ diff --git a/aten/src/ATen/cuda/tunable/GemmCommon.h b/aten/src/ATen/cuda/tunable/GemmCommon.h index 5ed30f74b989..21d85a031e65 100644 --- a/aten/src/ATen/cuda/tunable/GemmCommon.h +++ b/aten/src/ATen/cuda/tunable/GemmCommon.h @@ -469,7 +469,11 @@ struct GemmAndBiasParams : OpParams { bool duplicate_inputs_{false}; }; +<<<<<<< HEAD template +======= +template +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) struct GemmStridedBatchedParams : OpParams { std::string BLASSignature() const override { std::string alpha_str = to_string_opmath(alpha); @@ -477,7 +481,11 @@ struct GemmStridedBatchedParams : OpParams { return fmt::sprintf("- { function: matmul, M: %ld, N: 
%ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: %ld, stride_b: %ld, stride_c: %ld, stride_d: %ld, " "alpha: %s, beta: %s, transA: %c, transB: %c, batch_count: %ld, a_type: %s, b_type: %s, c_type: %s, d_type: %s, scale_type: %s, compute_type: %s }", m, n, k, lda, ldb, ldc, ldc, stride_a, stride_b, stride_c, stride_c, alpha_str, beta_str, transa, transb, batch, +<<<<<<< HEAD BLASTypeName(T{}), BLASTypeName(T{}), BLASTypeName(T{}), BLASTypeName(T{}), ComputeTypeFor(), ComputeTypeFor()); +======= + BLASTypeName(T{}), BLASTypeName(T{}), BLASTypeName(C_Dtype{}), BLASTypeName(T{}), ComputeTypeFor(), ComputeTypeFor()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } std::string Signature() const override { @@ -517,7 +525,11 @@ struct GemmStridedBatchedParams : OpParams { c10::DeviceIndex device = 0; AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); size_t c_size = GetSizeC(); +<<<<<<< HEAD copy->c = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(c_size)); +======= + copy->c = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(c_size)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync( copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true)); if (duplicate_inputs) { @@ -544,7 +556,11 @@ struct GemmStridedBatchedParams : OpParams { } TuningStatus NumericalCheck(GemmStridedBatchedParams *other) { +<<<<<<< HEAD auto c_dtype = c10::CppTypeToScalarType::value; +======= + auto c_dtype = c10::CppTypeToScalarType::value; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? 
OK : FAIL; } @@ -561,7 +577,11 @@ struct GemmStridedBatchedParams : OpParams { int64_t ldb{}; int64_t stride_b{}; at::opmath_type beta; +<<<<<<< HEAD T* c{}; +======= + C_Dtype* c{}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t ldc{}; int64_t stride_c{}; int64_t batch{}; diff --git a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h index b899efde0e9c..863907c47595 100644 --- a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h +++ b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h @@ -85,6 +85,18 @@ constexpr hipDataType HipDataTypeFor() { return static_cast(500); } +<<<<<<< HEAD +======= +template <> +constexpr hipDataType HipDataTypeFor() { +#if ROCM_VERSION >= 70000 + return HIP_R_4F_E2M1; +#else + return static_cast(33); +#endif +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template int GetBatchFromParams(const GemmParams* params) { return 1; @@ -591,6 +603,17 @@ auto GetHipBlasLtTypeStringAndOps() { auto b_datatype = HipDataTypeFor(); auto in_out_datatype = HipDataTypeFor(); std::vector heuristic_result; +<<<<<<< HEAD +======= +#if ROCM_VERSION == 60400 + // hipblaslt TT fp32 regression on ROCm 6.4, cannot use + if ((a_datatype == HIP_R_32F || b_datatype == HIP_R_32F || in_out_datatype == HIP_R_32F) + && (transa_outer == HIPBLAS_OP_T && transb_outer == HIPBLAS_OP_T)) { + std::vector>>> ignore; + return ignore; + } +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F; if (at::globalContext().allowTF32CuBLAS()) { diff --git a/aten/src/ATen/cuda/tunable/Tunable.cpp b/aten/src/ATen/cuda/tunable/Tunable.cpp index 71ac97e66688..616e80c6e651 100644 --- a/aten/src/ATen/cuda/tunable/Tunable.cpp +++ b/aten/src/ATen/cuda/tunable/Tunable.cpp @@ -524,8 +524,13 @@ void TuningContext::EnableNumericsCheck(bool value) { } bool TuningContext::IsNumericsCheckEnabled() const { +<<<<<<< HEAD const char *env = getenv("PYTORCH_TUNABLEOP_NUMERICAL_CHECK"); if (env != nullptr && strcmp(env, "1") == 0) { +======= + const auto env = c10::utils::get_env("PYTORCH_TUNABLEOP_NUMERICAL_CHECK"); + if (env == "1") { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return true; } return numerics_check_enable_; diff --git a/aten/src/ATen/cuda/tunable/Tunable.h b/aten/src/ATen/cuda/tunable/Tunable.h index b8187b4254bf..5f8e65ea6224 100644 --- a/aten/src/ATen/cuda/tunable/Tunable.h +++ b/aten/src/ATen/cuda/tunable/Tunable.h @@ -40,9 +40,15 @@ enum TORCH_CUDA_CPP_API TuningStatus { class TORCH_CUDA_CPP_API ResultEntry { public: explicit ResultEntry(std::string key, double time) : key_(std::move(key)), time_(time) {} +<<<<<<< HEAD explicit ResultEntry(std::string key, double time, const std::string& blas_sig ) : key_(std::move(key)), time_(time), blas_sig_(blas_sig) {} bool operator==(const ResultEntry& other) { return key_ == other.key_; } bool operator!=(const ResultEntry& other) { return key_ != other.key_; } +======= + explicit ResultEntry(std::string key, double time, std::string blas_sig ) : key_(std::move(key)), time_(time), blas_sig_(std::move(blas_sig)) {} + bool operator==(const ResultEntry& other) const { return key_ == other.key_; } + bool operator!=(const ResultEntry& other) const { return key_ != 
other.key_; } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) operator std::string () { return key_; } std::string GetKey() const { return key_; } double GetTime() const { return time_; } diff --git a/aten/src/ATen/cudnn/Descriptors.cpp b/aten/src/ATen/cudnn/Descriptors.cpp index d7c32ac2cf33..b40f9d022f64 100644 --- a/aten/src/ATen/cudnn/Descriptors.cpp +++ b/aten/src/ATen/cudnn/Descriptors.cpp @@ -156,8 +156,12 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo default: TORCH_INTERNAL_ASSERT(false, "unsupported memory_format for cuDNN filters"); } +<<<<<<< HEAD // NOLINTNEXTLINE(*narrowing-conversions) set(getDataType(t), static_cast(dim), size, filter_format); +======= + set(getDataType(t), static_cast(dim), size, filter_format); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } std::string cudnnMemoryFormatToString(cudnnTensorFormat_t tformat) { diff --git a/aten/src/ATen/cudnn/README.md b/aten/src/ATen/cudnn/README.md index 057fbc92ecb0..8f4de9e61745 100644 --- a/aten/src/ATen/cudnn/README.md +++ b/aten/src/ATen/cudnn/README.md @@ -1,4 +1,8 @@ All files living in this directory are written with the assumption that cuDNN is available, which means that these code are not guarded by `#if AT_CUDNN_ENABLED()`. Therefore, whenever you need to use definitions from here, please guard the `#include` and +<<<<<<< HEAD definition usages with `#if AT_CUDNN_ENABLED()` macro, e.g. [native/cudnn/BatchNorm.cpp](native/cudnn/BatchNorm.cpp). +======= +definition usages with `#if AT_CUDNN_ENABLED()` macro, e.g. [native/cudnn/BatchNorm.cpp](../native/cudnn/BatchNorm.cpp). +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index 384d8dfe980f..118a02640967 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -86,7 +86,11 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface { TORCH_CHECK(false, "Cannot get device of pointer on CUDA without ATen_cuda library. ", CUDA_HELP); } +<<<<<<< HEAD bool isPinnedPtr(const void* data) const override { +======= + bool isPinnedPtr(const void* /*data*/) const override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return false; } @@ -200,7 +204,11 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface { } #ifdef USE_ROCM +<<<<<<< HEAD virtual bool isGPUArch(DeviceIndex /*device_index*/, const std::vector& /*archs*/) const { +======= + virtual bool isGPUArch(const std::vector& /*archs*/, DeviceIndex = -1 /*device_index*/) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(false, "Cannot check GPU arch without ATen_cuda library. 
", CUDA_HELP); } #endif diff --git a/aten/src/ATen/detail/HIPHooksInterface.h b/aten/src/ATen/detail/HIPHooksInterface.h index e19a379efbda..f63b6736a8b8 100644 --- a/aten/src/ATen/detail/HIPHooksInterface.h +++ b/aten/src/ATen/detail/HIPHooksInterface.h @@ -6,8 +6,11 @@ #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NB: Class must live in `at` due to limitations of Registry.h. namespace at { @@ -37,7 +40,11 @@ struct TORCH_API HIPHooksInterface : AcceleratorHooksInterface { return -1; } +<<<<<<< HEAD bool isPinnedPtr(const void* data) const override { +======= + bool isPinnedPtr(const void* /*data*/ ) const override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return false; } @@ -49,7 +56,11 @@ struct TORCH_API HIPHooksInterface : AcceleratorHooksInterface { return 0; } +<<<<<<< HEAD bool hasPrimaryContext(DeviceIndex device_index) const override { +======= + bool hasPrimaryContext(DeviceIndex /*device_index*/ ) const override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(false, "Cannot check primary context without ATen_hip library."); } }; diff --git a/aten/src/ATen/detail/IPUHooksInterface.h b/aten/src/ATen/detail/IPUHooksInterface.h index ee29aa352f3d..f86fb42db259 100644 --- a/aten/src/ATen/detail/IPUHooksInterface.h +++ b/aten/src/ATen/detail/IPUHooksInterface.h @@ -15,7 +15,11 @@ struct TORCH_API IPUHooksInterface : AcceleratorHooksInterface { TORCH_CHECK(false, "Cannot initialize IPU without ATen_ipu library."); } +<<<<<<< HEAD bool hasPrimaryContext(DeviceIndex device_index) const override { +======= + bool hasPrimaryContext(DeviceIndex /*device_index*/) const override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(false, "Cannot initialize IPU without ATen_ipu library."); return false; } @@ -26,7 +30,11 @@ struct TORCH_API IPUHooksInterface : AcceleratorHooksInterface { } Generator getNewGenerator( +<<<<<<< HEAD DeviceIndex device_index [[maybe_unused]] = -1) const override { +======= + DeviceIndex /*device_index*/ = -1) const override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(false, "Cannot initialize IPU without ATen_ipu library."); } }; diff --git a/aten/src/ATen/detail/MAIAHooksInterface.h b/aten/src/ATen/detail/MAIAHooksInterface.h index 554cc93043fd..cf4d09832002 100644 --- a/aten/src/ATen/detail/MAIAHooksInterface.h +++ b/aten/src/ATen/detail/MAIAHooksInterface.h @@ -17,7 +17,11 @@ struct TORCH_API MAIAHooksInterface : AcceleratorHooksInterface { TORCH_CHECK(false, "Cannot initialize MAIA without ATen_maia library."); } +<<<<<<< HEAD bool hasPrimaryContext(DeviceIndex device_index) const override { +======= + bool hasPrimaryContext(DeviceIndex /*device_index*/) const override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(false, "Cannot initialize MAIA without ATen_maia library."); return false; } diff --git a/aten/src/ATen/detail/MPSHooksInterface.h b/aten/src/ATen/detail/MPSHooksInterface.h index 01d6281e8afe..3fd0f71a2c3e 100644 --- a/aten/src/ATen/detail/MPSHooksInterface.h +++ 
b/aten/src/ATen/detail/MPSHooksInterface.h @@ -78,6 +78,12 @@ struct TORCH_API MPSHooksInterface : AcceleratorHooksInterface { virtual uint32_t acquireEvent(bool enable_timing) const { FAIL_MPSHOOKS_FUNC(__func__); } +<<<<<<< HEAD +======= + Device getDeviceFromPtr(void* data) const override { + TORCH_CHECK(false, "Cannot get device of pointer on MPS without ATen_mps library. "); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) virtual void releaseEvent(uint32_t event_id) const { FAIL_MPSHOOKS_FUNC(__func__); } diff --git a/aten/src/ATen/detail/MTIAHooksInterface.h b/aten/src/ATen/detail/MTIAHooksInterface.h index b69e0027ea13..4bd62b9ffa4a 100644 --- a/aten/src/ATen/detail/MTIAHooksInterface.h +++ b/aten/src/ATen/detail/MTIAHooksInterface.h @@ -12,7 +12,10 @@ #include #include +<<<<<<< HEAD C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-parameter") +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at { class Context; } @@ -46,7 +49,11 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface { return 0; } +<<<<<<< HEAD virtual void deviceSynchronize(c10::DeviceIndex device_index) const { +======= + virtual void deviceSynchronize(c10::DeviceIndex /*device_index*/) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FAIL_MTIAHOOKS_FUNC(__func__); } @@ -54,11 +61,19 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface { FAIL_MTIAHOOKS_FUNC(__func__); } +<<<<<<< HEAD bool hasPrimaryContext(DeviceIndex device_index) const override { return false; } void setCurrentDevice(DeviceIndex device) const override { +======= + bool hasPrimaryContext(DeviceIndex /*device_index*/) const override { + return false; + } + + void setCurrentDevice(DeviceIndex /*device*/) const override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FAIL_MTIAHOOKS_FUNC(__func__); } @@ -67,31 +82,60 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface { return -1; } +<<<<<<< HEAD DeviceIndex exchangeDevice(DeviceIndex device) const override { +======= + DeviceIndex exchangeDevice(DeviceIndex /*device*/) const override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FAIL_MTIAHOOKS_FUNC(__func__); return -1; } +<<<<<<< HEAD DeviceIndex maybeExchangeDevice(DeviceIndex device) const override { +======= + DeviceIndex maybeExchangeDevice(DeviceIndex /*device*/) const override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FAIL_MTIAHOOKS_FUNC(__func__); return -1; } +<<<<<<< HEAD virtual c10::Stream getCurrentStream(DeviceIndex device) const { +======= + virtual c10::Stream getCurrentStream(DeviceIndex /*device*/) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FAIL_MTIAHOOKS_FUNC(__func__); return c10::Stream::unpack3(-1, 0, c10::DeviceType::MTIA); } +<<<<<<< HEAD virtual c10::Stream getDefaultStream(DeviceIndex device) const { +======= + virtual int64_t getCurrentRawStream(DeviceIndex /*device*/) const { + FAIL_MTIAHOOKS_FUNC(__func__); + return -1; + } + + virtual c10::Stream getDefaultStream(DeviceIndex 
/*device*/) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FAIL_MTIAHOOKS_FUNC(__func__); return c10::Stream::unpack3(-1, 0, c10::DeviceType::MTIA); } +<<<<<<< HEAD virtual void setCurrentStream(const c10::Stream& stream) const { FAIL_MTIAHOOKS_FUNC(__func__); } bool isPinnedPtr(const void* data) const override { +======= + virtual void setCurrentStream(const c10::Stream& /*stream*/ ) const { + FAIL_MTIAHOOKS_FUNC(__func__); + } + + bool isPinnedPtr(const void* /*data*/) const override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return false; } @@ -100,12 +144,25 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface { return nullptr; } +<<<<<<< HEAD virtual PyObject* memoryStats(DeviceIndex device) const { +======= + virtual PyObject* memoryStats(DeviceIndex /*device*/) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FAIL_MTIAHOOKS_FUNC(__func__); return nullptr; } +<<<<<<< HEAD virtual PyObject* getDeviceCapability(DeviceIndex device) const { +======= + virtual PyObject* getDeviceCapability(DeviceIndex /*device*/) const { + FAIL_MTIAHOOKS_FUNC(__func__); + return nullptr; + } + + virtual PyObject* getDeviceProperties(DeviceIndex device) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FAIL_MTIAHOOKS_FUNC(__func__); return nullptr; } @@ -116,6 +173,7 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface { virtual void recordMemoryHistory( +<<<<<<< HEAD const std::optional& enabled, const std::string& stacks, size_t max_entries) const { @@ -123,6 +181,15 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface { } virtual PyObject* memorySnapshot() const { +======= + const std::optional& /*enabled*/, + const std::string& /*stacks*/, + size_t /*max_entries*/) const { + FAIL_MTIAHOOKS_FUNC(__func__); + } + + virtual PyObject* memorySnapshot(const std::optional& local_path) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FAIL_MTIAHOOKS_FUNC(__func__); return nullptr; } @@ -132,10 +199,21 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface { return 0; } +<<<<<<< HEAD virtual void resetPeakMemoryStats(DeviceIndex device) const { FAIL_MTIAHOOKS_FUNC(__func__); } +======= + virtual void resetPeakMemoryStats(DeviceIndex /*device*/) const { + FAIL_MTIAHOOKS_FUNC(__func__); + } + + virtual void attachOutOfMemoryObserver(PyObject* observer) const { + FAIL_MTIAHOOKS_FUNC(__func__); + return; + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; struct TORCH_API MTIAHooksArgs {}; @@ -149,4 +227,7 @@ TORCH_API const MTIAHooksInterface& getMTIAHooks(); TORCH_API bool isMTIAHooksBuilt(); } // namespace detail } // namespace at +<<<<<<< HEAD C10_DIAGNOSTIC_POP() +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/detail/PrivateUse1HooksInterface.h b/aten/src/ATen/detail/PrivateUse1HooksInterface.h index 69819c764260..0dbc878cf821 100644 --- a/aten/src/ATen/detail/PrivateUse1HooksInterface.h +++ b/aten/src/ATen/detail/PrivateUse1HooksInterface.h 
@@ -23,6 +23,17 @@ struct TORCH_API PrivateUse1HooksInterface : AcceleratorHooksInterface { ~PrivateUse1HooksInterface() override = default; +<<<<<<< HEAD +======= + bool isBuilt() const override { + FAIL_PRIVATEUSE1HOOKS_FUNC(__func__); + } + + bool isAvailable() const override { + FAIL_PRIVATEUSE1HOOKS_FUNC(__func__); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const at::Generator& getDefaultGenerator( c10::DeviceIndex device_index) const override { FAIL_PRIVATEUSE1HOOKS_FUNC(__func__); diff --git a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp index cca20e9e553e..9bcaccda5840 100644 --- a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp +++ b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp @@ -193,6 +193,10 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) { OP_DECOMPOSE(_lu_with_info); OP_DECOMPOSE(matmul); OP_DECOMPOSE(matrix_H); +<<<<<<< HEAD +======= + OP_DECOMPOSE(matrix_exp); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) OP_DECOMPOSE(matrix_power); OP_DECOMPOSE2(max, other ); OP_DECOMPOSE(max_pool1d); diff --git a/aten/src/ATen/functorch/BatchRulesPooling.cpp b/aten/src/ATen/functorch/BatchRulesPooling.cpp index c6cab4a42d6f..fe998e027513 100644 --- a/aten/src/ATen/functorch/BatchRulesPooling.cpp +++ b/aten/src/ATen/functorch/BatchRulesPooling.cpp @@ -12,7 +12,11 @@ namespace at::functorch { template +<<<<<<< HEAD std::tuple,Tensor, std::optional> +======= +static std::tuple,Tensor, std::optional> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) max_pool_with_indices_batch_rule_helper( const Tensor& self, std::optional self_bdim, IntArrayRef kernel_size, IntArrayRef stride, diff --git a/aten/src/ATen/functorch/BatchRulesRandomness.cpp b/aten/src/ATen/functorch/BatchRulesRandomness.cpp index b578047dd6fd..a39c8c31d06d 100644 --- a/aten/src/ATen/functorch/BatchRulesRandomness.cpp +++ b/aten/src/ATen/functorch/BatchRulesRandomness.cpp @@ -20,7 +20,11 @@ namespace at::functorch { template +<<<<<<< HEAD Tensor random_batching_rule(SymIntArrayRef shape, ExtraArgs... extra_args) { +======= +static Tensor random_batching_rule(SymIntArrayRef shape, ExtraArgs... extra_args) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode); auto maybe_layer = maybeCurrentDynamicLayer(); TORCH_INTERNAL_ASSERT(maybe_layer.has_value()); @@ -37,7 +41,11 @@ Tensor random_batching_rule(SymIntArrayRef shape, ExtraArgs... extra_args) { } template +<<<<<<< HEAD Tensor& random_inplace_batching_rule(Tensor& self, ExtraArgs... extra_args) { +======= +static Tensor& random_inplace_batching_rule(Tensor& self, ExtraArgs... extra_args) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode); auto maybe_layer = maybeCurrentDynamicLayer(); TORCH_INTERNAL_ASSERT(maybe_layer.has_value()); @@ -108,7 +116,11 @@ static Tensor& bernoulli_inplace_Tensor_batching_rule(Tensor& self, const Tensor } template +<<<<<<< HEAD Tensor randperm_batching_rule(int64_t n, ExtraArgs... 
extra_args) { +======= +static Tensor randperm_batching_rule(int64_t n, ExtraArgs... extra_args) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode); auto maybe_layer = maybeCurrentDynamicLayer(); auto const batch_size = maybe_layer->batchSize(); @@ -127,7 +139,11 @@ Tensor randperm_batching_rule(int64_t n, ExtraArgs... extra_args) { } template +<<<<<<< HEAD Tensor unary_pointwise_random_batch_rule(const Tensor& tensor, ExtraArgs... extra_args) { +======= +static Tensor unary_pointwise_random_batch_rule(const Tensor& tensor, ExtraArgs... extra_args) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode); auto maybe_layer = maybeCurrentDynamicLayer(); const auto cur_level = maybe_layer->layerId(); @@ -153,7 +169,11 @@ Tensor unary_pointwise_random_batch_rule(const Tensor& tensor, ExtraArgs... extr } template +<<<<<<< HEAD Tensor tensor_like_random_batch_rule(const Tensor& self, ExtraArgs... extra_args) { +======= +static Tensor tensor_like_random_batch_rule(const Tensor& self, ExtraArgs... extra_args) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode); auto maybe_layer = maybeCurrentDynamicLayer(); const auto cur_level = maybe_layer->layerId(); @@ -272,7 +292,11 @@ struct RandomBatchRuleHelper> { }; template +<<<<<<< HEAD Tensor rand_int_wrapper(SymIntArrayRef shape, c10::SymInt high, T... extra_args) { +======= +static Tensor rand_int_wrapper(SymIntArrayRef shape, c10::SymInt high, T... extra_args) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Func(high, shape, std::forward(extra_args)...); } @@ -299,7 +323,11 @@ struct RandIntBatchRuleHelper> { }; template +<<<<<<< HEAD Tensor rand_int_low_wrapper(SymIntArrayRef shape, T0 scalar0, T1 scalar1, T... extra_args) { +======= +static Tensor rand_int_low_wrapper(SymIntArrayRef shape, T0 scalar0, T1 scalar1, T... extra_args) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Func(scalar0, scalar1, shape, std::forward(extra_args)...); } @@ -346,7 +374,11 @@ struct NormalPointwiseBatchRule> { }; template +<<<<<<< HEAD Tensor normal_wrapper(const Tensor& tensor, double scalar, T... extra_args) { +======= +static Tensor normal_wrapper(const Tensor& tensor, double scalar, T... 
extra_args) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Func(scalar, tensor, extra_args...); } diff --git a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp index a7366eef4fd3..6f22c16bfa78 100644 --- a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp @@ -385,9 +385,17 @@ namespace { // next broadcast all index tensors together try { indices = at::expand_outplace(indices); +<<<<<<< HEAD } catch (std::exception &e) { TORCH_CHECK_INDEX(false, "shape mismatch: indexing tensors could not be broadcast together" " with shapes "); +======= + } catch (std::exception&) { + TORCH_CHECK_INDEX( + false, + "shape mismatch: indexing tensors could not be broadcast together" + " with shapes "); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // add missing null Tensors so that it matches self.dim() while (indices.size() < static_cast(self.dim())) { @@ -771,6 +779,18 @@ std::tuple> scatter_add_batch_rule( self, self_bdim, dim, index, index_bdim, src, src_bdim); } +<<<<<<< HEAD +======= +std::tuple> scatter_add__batch_rule( + const Tensor& self, std::optional self_bdim, + int64_t dim, + const Tensor& index, std::optional index_bdim, + const Tensor& src, std::optional src_bdim) { + return scatter_batch_rule(ATEN_FN(scatter_add_), + self, self_bdim, dim, index, index_bdim, src, src_bdim); +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::tuple> scatter_reduce_batch_rule( const Tensor& self, std::optional self_bdim, int64_t dim, @@ -1276,6 +1296,10 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { VMAP_SUPPORT2(scatter, value, scatter_value_batch_rule); VMAP_SUPPORT2(scatter, src, scatter_src_batch_rule); VMAP_SUPPORT(scatter_add, scatter_add_batch_rule); +<<<<<<< HEAD +======= + VMAP_SUPPORT(scatter_add_, scatter_add__batch_rule); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VMAP_SUPPORT2(scatter, reduce, scatter_reduce_batch_rule); VMAP_SUPPORT2(scatter, value_reduce, scatter_value_reduce_batch_rule); VMAP_SUPPORT2(scatter_reduce, two, scatter_reduce_two_batch_rule); diff --git a/aten/src/ATen/functorch/BatchedFallback.cpp b/aten/src/ATen/functorch/BatchedFallback.cpp index 55d9e91834d2..d2278e3c5502 100644 --- a/aten/src/ATen/functorch/BatchedFallback.cpp +++ b/aten/src/ATen/functorch/BatchedFallback.cpp @@ -19,7 +19,11 @@ namespace at::functorch { +<<<<<<< HEAD bool kVmapFallbackWarningEnabled = true; +======= +static bool kVmapFallbackWarningEnabled = true; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool isVmapFallbackWarningEnabled() { return kVmapFallbackWarningEnabled; @@ -29,7 +33,11 @@ void setVmapFallbackWarningEnabled(bool enabled) { kVmapFallbackWarningEnabled = enabled; } +<<<<<<< HEAD bool kVmapFallbackEnabled = true; +======= +static bool kVmapFallbackEnabled = true; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool isVmapFallbackEnabled() { return kVmapFallbackEnabled; diff --git a/aten/src/ATen/functorch/BatchedTensorImpl.h b/aten/src/ATen/functorch/BatchedTensorImpl.h index 
e42f8dd87b50..62e3ade38937 100644 --- a/aten/src/ATen/functorch/BatchedTensorImpl.h +++ b/aten/src/ATen/functorch/BatchedTensorImpl.h @@ -159,6 +159,10 @@ constexpr DispatchKeySet kKeysToPropagateToWrapper({ DispatchKey::XLA, DispatchKey::CUDA, DispatchKey::CPU, +<<<<<<< HEAD +======= + DispatchKey::PrivateUse1, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }); inline DispatchKeySet getKeysToPropagateToWrapper(const Tensor& tensor, DispatchKeySet to_propagate=kKeysToPropagateToWrapper) { diff --git a/aten/src/ATen/functorch/Interpreter.h b/aten/src/ATen/functorch/Interpreter.h index bdea11d3b2a0..84edfb40e1db 100644 --- a/aten/src/ATen/functorch/Interpreter.h +++ b/aten/src/ATen/functorch/Interpreter.h @@ -8,6 +8,11 @@ #include #include +<<<<<<< HEAD +======= +#include + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::functorch { // NOTE: [functorch interpreter stack] @@ -91,24 +96,112 @@ std::ostream& operator<<(std::ostream& os, const TransformType& t); struct VmapInterpreterMeta { explicit VmapInterpreterMeta(c10::SymInt batchSize, RandomnessType randomness) : batchSize_(std::move(batchSize)), randomness_(randomness) {} +<<<<<<< HEAD + c10::SymInt batchSize_; + RandomnessType randomness_; +======= + c10::SymInt batchSize_; RandomnessType randomness_; + + VmapInterpreterMeta() = default; + VmapInterpreterMeta(const VmapInterpreterMeta&) = default; + VmapInterpreterMeta(VmapInterpreterMeta&&) = default; + VmapInterpreterMeta& operator=(const VmapInterpreterMeta&) = default; + VmapInterpreterMeta& operator=(VmapInterpreterMeta&&) = default; + ~VmapInterpreterMeta() = default; + + template + friend void to_json(T& json_j, const VmapInterpreterMeta& json_t) { + if (json_t.batchSize_.is_heap_allocated()) { + throw std::runtime_error("Serialization for heap-allocated SymInt is not implemented yet"); + } + json_j["batchSize"] = json_t.batchSize_.as_int_unchecked(); + json_j["randomness"] = static_cast(json_t.randomness_); + } + + template + friend void from_json(const T& json_j, VmapInterpreterMeta& json_t) { + json_t.batchSize_ = c10::SymInt(SymInt::Unchecked::UNCHECKED, json_j["batchSize"]); + json_t.randomness_ = static_cast(json_j["randomness"]); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; struct GradInterpreterMeta { explicit GradInterpreterMeta(bool prevGradMode): prevGradMode_(prevGradMode) {} +<<<<<<< HEAD + bool prevGradMode_; +======= + GradInterpreterMeta() = default; + GradInterpreterMeta(const GradInterpreterMeta&) = default; + GradInterpreterMeta(GradInterpreterMeta&&) = default; + GradInterpreterMeta& operator=(const GradInterpreterMeta&) = default; + GradInterpreterMeta& operator=(GradInterpreterMeta&&) = default; + ~GradInterpreterMeta() = default; + bool prevGradMode_; + template + friend void to_json(T& json_j, const GradInterpreterMeta& json_t) { + json_j["prevGradMode"] = json_t.prevGradMode_; + } + + template + friend void from_json(const T& json_j, GradInterpreterMeta& json_t) { + json_t.prevGradMode_ = json_j["prevGradMode"]; + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; struct JvpInterpreterMeta { explicit JvpInterpreterMeta(bool prevFwdGradMode) : prevFwdGradMode_(prevFwdGradMode) {} +<<<<<<< HEAD + bool prevFwdGradMode_; 
+======= + JvpInterpreterMeta() = default; + JvpInterpreterMeta(const JvpInterpreterMeta&) = default; + JvpInterpreterMeta(JvpInterpreterMeta&&) = default; + JvpInterpreterMeta& operator=(const JvpInterpreterMeta&) = default; + JvpInterpreterMeta& operator=(JvpInterpreterMeta&&) = default; + ~JvpInterpreterMeta() = default; + bool prevFwdGradMode_; + template + friend void to_json(T& json_j, const JvpInterpreterMeta& json_t) { + json_j["prevFwdGradMode"] = json_t.prevFwdGradMode_; + } + + template + friend void from_json(const T& json_j, JvpInterpreterMeta& json_t) { + json_t.prevFwdGradMode_ = json_j["prevFwdGradMode"]; + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; struct FunctionalizeInterpreterMeta { explicit FunctionalizeInterpreterMeta(bool functionalizeAddBackViews) : functionalizeAddBackViews_(functionalizeAddBackViews) {} +<<<<<<< HEAD + bool functionalizeAddBackViews_; +======= + FunctionalizeInterpreterMeta() = default; + FunctionalizeInterpreterMeta(const FunctionalizeInterpreterMeta&) = default; + FunctionalizeInterpreterMeta(FunctionalizeInterpreterMeta&&) = default; + FunctionalizeInterpreterMeta& operator=(const FunctionalizeInterpreterMeta&) = default; + FunctionalizeInterpreterMeta& operator=(FunctionalizeInterpreterMeta&&) = default; + ~FunctionalizeInterpreterMeta() = default; + bool functionalizeAddBackViews_; + template + friend void to_json(T& json_j, const FunctionalizeInterpreterMeta& json_t) { + json_j["functionalizeAddBackViews"] = json_t.functionalizeAddBackViews_; + } + + template + friend void from_json(const T& json_j, FunctionalizeInterpreterMeta& json_t) { + json_t.functionalizeAddBackViews_ = json_j["functionalizeAddBackViews"]; + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; typedef std::variant< @@ -172,6 +265,78 @@ struct Interpreter { // Please don't use this explicit Interpreter() = default; +<<<<<<< HEAD +======= + template + friend void to_json(T& json_j, const Interpreter& json_t) { + json_j["type"] = static_cast(json_t.type_); + json_j["level"] = json_t.level_; + if (json_t.savedLocalDispatchKeySet_) { + json_j["savedLocalDispatchKeySet"] = { + {"included", json_t.savedLocalDispatchKeySet_->included_.raw_repr()}, + {"excluded", json_t.savedLocalDispatchKeySet_->excluded_.raw_repr()} + }; + } else { + json_j["savedLocalDispatchKeySet"] = nlohmann::json(); + } + json_j["is_alive"] = *json_t.is_alive_; + std::visit([&](auto&& arg) { + using V = std::decay_t; + if constexpr (std::is_same_v) { + json_j["meta"] = {{"Torch", arg}}; + } else if constexpr (std::is_same_v) { + json_j["meta"] = {{"Grad", arg}}; + } else if constexpr (std::is_same_v) { + json_j["meta"] = {{"Jvp", arg}}; + } else if constexpr (std::is_same_v) { + json_j["meta"] = {{"Vmap", arg}}; + } else if constexpr (std::is_same_v) { + json_j["meta"] = {{"Functionalize", arg}}; + } else { + static_assert(false && sizeof(V), "unknown variant case"); + } + }, json_t.meta_); + } + + template + friend void from_json(const T& json_j, Interpreter& json_t) { + json_t.type_ = static_cast(json_j["type"]); + json_t.level_ = json_j["level"]; + auto savedLocalDispatchKeySet = json_j["savedLocalDispatchKeySet"]; + if (savedLocalDispatchKeySet.is_null()) { + json_t.savedLocalDispatchKeySet_ = std::nullopt; + } else { + c10::impl::PODLocalDispatchKeySet pod; + 
pod.set_included(DispatchKeySet::from_raw_repr(savedLocalDispatchKeySet["included"].template get())); + pod.set_excluded(DispatchKeySet::from_raw_repr(savedLocalDispatchKeySet["excluded"].template get())); + json_t.savedLocalDispatchKeySet_ = c10::impl::LocalDispatchKeySet(pod); + } + json_t.is_alive_ = std::make_shared(json_j["is_alive"]); + auto meta = json_j["meta"]; + if (meta.contains("Torch")) { + json_t.meta_.emplace(meta["Torch"].template get()); + } else if (meta.contains("Grad")) { + json_t.meta_.emplace(meta["Grad"].template get()); + } else if (meta.contains("Jvp")) { + json_t.meta_.emplace(meta["Jvp"].template get()); + } else if (meta.contains("Vmap")) { + json_t.meta_.emplace(meta["Vmap"].template get()); + } else if (meta.contains("Functionalize")) { + json_t.meta_.emplace(meta["Functionalize"].template get()); + } else { + throw std::runtime_error("unknown interpreter metadata type"); + } + } + + std::string serialize() const { + return nlohmann::json(*this).dump(); + } + + static Interpreter deserialize(const std::string& serialized) { + return nlohmann::json::parse(serialized).get(); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) private: explicit Interpreter(TransformType type, int64_t level, InterpreterMeta meta): type_(type), level_(level), is_alive_(std::make_shared(false)), meta_(std::move(meta)) {} diff --git a/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp b/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp index 7bc3a3cbfe44..f9aad2862469 100644 --- a/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp +++ b/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp @@ -143,7 +143,11 @@ static Tensor make_feature_noise(const Tensor& input) { } static bool is_fused_kernel_acceptable(const Tensor& input, double p) { +<<<<<<< HEAD return (input.is_cuda() || input.is_xpu() || input.is_lazy()) && p > 0 && p < 1 && input.numel() > 0; +======= + return (input.is_cuda() || input.is_xpu() || input.is_lazy() || input.is_privateuseone()) && p > 0 && p < 1 && input.numel() > 0; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // NB: sure, we could have used different overloads here, but I would feel insecure diff --git a/aten/src/ATen/functorch/TensorWrapper.cpp b/aten/src/ATen/functorch/TensorWrapper.cpp index 4f50a1fe2b40..5c39ddb4ddd8 100644 --- a/aten/src/ATen/functorch/TensorWrapper.cpp +++ b/aten/src/ATen/functorch/TensorWrapper.cpp @@ -56,7 +56,12 @@ void dumpTensorCout(const Tensor& tensor) { static c10::intrusive_ptr makeTensorWrapperPtr(const Tensor& tensor, int64_t level, const std::shared_ptr& life_handle) { auto keys_to_propagate = kKeysToPropagateToWrapper | DispatchKeySet({ +<<<<<<< HEAD DispatchKey::AutogradCPU, DispatchKey::AutogradCUDA, DispatchKey::AutogradXLA}); +======= + DispatchKey::AutogradCPU, DispatchKey::AutogradCUDA, DispatchKey::AutogradXLA, + DispatchKey::AutogradPrivateUse1}); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto key_set = getKeysToPropagateToWrapper(tensor, keys_to_propagate); key_set = key_set.add(DispatchKey::FuncTorchGradWrapper); return c10::make_intrusive(key_set, tensor, level, life_handle); @@ -76,7 +81,12 @@ static Tensor unsafeMakeTensorWrapper( } auto keys_to_propagate = kKeysToPropagateToWrapper | DispatchKeySet({ +<<<<<<< HEAD DispatchKey::AutogradCPU, DispatchKey::AutogradCUDA, 
DispatchKey::AutogradXLA}); +======= + DispatchKey::AutogradCPU, DispatchKey::AutogradCUDA, DispatchKey::AutogradXLA, + DispatchKey::AutogradPrivateUse1}); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto key_set = getKeysToPropagateToWrapper(tensor, keys_to_propagate); key_set = key_set.add(DispatchKey::FuncTorchGradWrapper); auto result = at::detail::make_tensor( diff --git a/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h index 9e714101d5a9..fff700505191 100644 --- a/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h @@ -67,8 +67,13 @@ class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllo allocator_->setMemoryFraction(fraction, device); } +<<<<<<< HEAD void emptyCache() override { allocator_->emptyCache(); +======= + void emptyCache(MempoolId_t mempool_id = {0, 0}) override { + allocator_->emptyCache(mempool_id); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void enable(bool value) override { @@ -103,8 +108,13 @@ class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllo allocator_->resetPeakStats(device); } +<<<<<<< HEAD HIPCachingAllocator::SnapshotInfo snapshot() override { return allocator_->snapshot(); +======= + HIPCachingAllocator::SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) override { + return allocator_->snapshot(mempool_id); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void beginAllocateToPool( @@ -128,10 +138,22 @@ class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllo return allocator_->getPoolUseCount(device, mempool_id); } +<<<<<<< HEAD void ensureExistsAndIncrefPool( c10::DeviceIndex device, MempoolId_t mempool_id) override { allocator_->ensureExistsAndIncrefPool(device, mempool_id); +======= + void createOrIncrefPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + HIPAllocator* allocator = nullptr) override { + allocator_->createOrIncrefPool(device, mempool_id, allocator); + } + + void setUseOnOOM(c10::DeviceIndex device, MempoolId_t mempool_id) override { + allocator_->setUseOnOOM(device, mempool_id); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } bool checkPoolLiveAllocations( @@ -157,8 +179,14 @@ class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllo bool enabled, HIPCachingAllocator::CreateContextFn context_recorder, size_t alloc_trace_max_entries, +<<<<<<< HEAD HIPCachingAllocator::RecordContext when) override { allocator_->recordHistory(enabled, context_recorder, alloc_trace_max_entries, when); +======= + HIPCachingAllocator::RecordContext when, + bool clearHistory) override { + allocator_->recordHistory(enabled, context_recorder, alloc_trace_max_entries, when, clearHistory); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void recordAnnotation( @@ -166,6 +194,17 @@ class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllo allocator_->recordAnnotation(md); } +<<<<<<< HEAD +======= + void pushCompileContext(std::string& md) override { + allocator_->pushCompileContext(md); + } + + void 
popCompileContext() override { + allocator_->popCompileContext(); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void attachOutOfMemoryObserver(HIPCachingAllocator::OutOfMemoryObserver observer) override { allocator_->attachOutOfMemoryObserver(observer); } diff --git a/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h index 0edd69f08a91..d3ef24c3351f 100644 --- a/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h @@ -37,8 +37,13 @@ inline void setMemoryFraction(double fraction, c10::DeviceIndex device) { return get()->setMemoryFraction(fraction, device); } +<<<<<<< HEAD inline void emptyCache() { return get()->emptyCache(); +======= +inline void emptyCache(MempoolId_t mempool_id = {0, 0}) { + return get()->emptyCache(mempool_id); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline void enable(bool value) { @@ -70,8 +75,13 @@ inline void resetPeakStats(c10::DeviceIndex device) { return get()->resetPeakStats(device); } +<<<<<<< HEAD inline HIPCachingAllocator::SnapshotInfo snapshot() { return get()->snapshot(); +======= +inline HIPCachingAllocator::SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) { + return get()->snapshot(mempool_id); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline std::shared_ptr getCheckpointState( @@ -101,9 +111,16 @@ inline void recordHistory( bool enabled, HIPCachingAllocator::CreateContextFn context_recorder, size_t alloc_trace_max_entries, +<<<<<<< HEAD HIPCachingAllocator::RecordContext when) { return get()->recordHistory( enabled, context_recorder, alloc_trace_max_entries, when); +======= + HIPCachingAllocator::RecordContext when, + bool clearHistory) { + return get()->recordHistory( + enabled, context_recorder, alloc_trace_max_entries, when, clearHistory); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline void recordAnnotation( @@ -111,6 +128,17 @@ inline void recordAnnotation( return get()->recordAnnotation(md); } +<<<<<<< HEAD +======= +inline void pushCompileContext(std::string& md) { + return get()->pushCompileContext(md); +} + +inline void popCompileContext() { + return get()->popCompileContext(); +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline bool isHistoryEnabled() { return get()->isHistoryEnabled(); } @@ -135,10 +163,22 @@ inline void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) { return get()->releasePool(device, mempool_id); } +<<<<<<< HEAD inline void ensureExistsAndIncrefPool( c10::DeviceIndex device, MempoolId_t mempool_id) { get()->ensureExistsAndIncrefPool(device, mempool_id); +======= +inline void createOrIncrefPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + HIPCachingAllocator::HIPAllocator* allocator_ptr = nullptr) { + get()->createOrIncrefPool(device, mempool_id, allocator_ptr); +} + +inline void setUseOnOOM(c10::DeviceIndex device, MempoolId_t mempool_id) { + get()->setUseOnOOM(device, mempool_id); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } 
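// The conflicted allocator hunks above add pool-aware arguments (a MempoolId_t
// defaulting to {0, 0}) and new hooks, and the masquerading wrapper simply
// forwards every call to the wrapped HIP allocator with the same defaults.
// A minimal sketch of that forwarding pattern, using hypothetical stand-in
// types rather than the real c10/HIP classes:
#include <cstdint>
#include <utility>

using MempoolIdSketch = std::pair<uint64_t, uint64_t>;  // stand-in for MempoolId_t

struct AllocatorIfaceSketch {
  virtual ~AllocatorIfaceSketch() = default;
  virtual void emptyCache(MempoolIdSketch mempool_id = {0, 0}) = 0;
};

// Wrapper that "masquerades" as the base interface: every override delegates
// to the wrapped allocator and repeats the base default argument, so interface
// growth only requires mechanical forwarding updates in the wrapper.
struct ForwardingAllocatorSketch final : AllocatorIfaceSketch {
  explicit ForwardingAllocatorSketch(AllocatorIfaceSketch* inner) : inner_(inner) {}
  void emptyCache(MempoolIdSketch mempool_id = {0, 0}) override {
    inner_->emptyCache(mempool_id);  // pure delegation
  }
 private:
  AllocatorIfaceSketch* inner_;
};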
inline int getPoolUseCount(c10::DeviceIndex device, MempoolId_t mempool_id) { diff --git a/aten/src/ATen/miopen/Descriptors.h b/aten/src/ATen/miopen/Descriptors.h index fae1befb5d15..97492565ceab 100644 --- a/aten/src/ATen/miopen/Descriptors.h +++ b/aten/src/ATen/miopen/Descriptors.h @@ -67,7 +67,11 @@ struct DescriptorDeleter { // function. template // NOLINTNEXTLINE(bugprone-exception-escape) +<<<<<<< HEAD class TORCH_CUDA_CPP_API Descriptor { +======= +class TORCH_HIP_CPP_API Descriptor { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) public: // Use desc() to access the underlying descriptor pointer in // a read-only fashion. Most client code should use this. @@ -93,7 +97,11 @@ class TORCH_CUDA_CPP_API Descriptor { std::unique_ptr> desc_; }; +<<<<<<< HEAD class TORCH_CUDA_CPP_API TensorDescriptor : public Descriptor< +======= +class TORCH_HIP_CPP_API TensorDescriptor : public Descriptor< +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) miopenTensorDescriptor, &miopenCreateTensorDescriptor, &miopenDestroyTensorDescriptor> { @@ -122,7 +130,11 @@ class TORCH_CUDA_CPP_API TensorDescriptor : public Descriptor< std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d); +<<<<<<< HEAD class TORCH_CUDA_CPP_API FilterDescriptor : public Descriptor< +======= +class TORCH_HIP_CPP_API FilterDescriptor : public Descriptor< +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) miopenTensorDescriptor, &miopenCreateTensorDescriptor, &miopenDestroyTensorDescriptor> { @@ -141,7 +153,11 @@ class TORCH_CUDA_CPP_API FilterDescriptor : public Descriptor< } }; +<<<<<<< HEAD struct TORCH_CUDA_CPP_API ConvolutionDescriptor +======= +struct TORCH_HIP_CPP_API ConvolutionDescriptor +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) : public Descriptor< miopenConvolutionDescriptor, &miopenCreateConvolutionDescriptor, @@ -156,11 +172,20 @@ struct TORCH_CUDA_CPP_API ConvolutionDescriptor } }; +<<<<<<< HEAD struct DropoutDescriptor : public Descriptor { +======= +// NOLINTNEXTLINE(bugprone-exception-escape) +struct TORCH_HIP_CPP_API DropoutDescriptor + : public Descriptor< + miopenDropoutDescriptor, + &miopenCreateDropoutDescriptor, + &miopenDestroyDropoutDescriptor> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void set(miopenHandle_t handle, float dropout, void* states, size_t stateSizeInBytes, unsigned long long seed, bool use_mask, bool state_evo, miopenRNGType_t rng_mode) { MIOPEN_CHECK(miopenSetDropoutDescriptor(mut_desc(), handle, dropout, states, stateSizeInBytes, seed, use_mask, state_evo, rng_mode)); @@ -172,7 +197,11 @@ struct DropoutDescriptor } }; +<<<<<<< HEAD struct TORCH_CUDA_CPP_API RNNDescriptor +======= +struct TORCH_HIP_CPP_API RNNDescriptor +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) : public Descriptor diff --git a/aten/src/ATen/miopen/Handle.h b/aten/src/ATen/miopen/Handle.h index 2ec059c6f5f5..af852c2ae672 100644 --- a/aten/src/ATen/miopen/Handle.h +++ b/aten/src/ATen/miopen/Handle.h @@ -5,6 +5,10 @@ namespace at::native { +<<<<<<< HEAD TORCH_CUDA_CPP_API miopenHandle_t getMiopenHandle(); +======= +TORCH_HIP_CPP_API 
miopenHandle_t getMiopenHandle(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::native diff --git a/aten/src/ATen/miopen/Types.h b/aten/src/ATen/miopen/Types.h index 0a8a1a952e2e..75dfa3021060 100644 --- a/aten/src/ATen/miopen/Types.h +++ b/aten/src/ATen/miopen/Types.h @@ -6,7 +6,11 @@ namespace at::native { +<<<<<<< HEAD TORCH_CUDA_CPP_API miopenDataType_t getMiopenDataType(const at::Tensor& tensor); +======= +TORCH_HIP_CPP_API miopenDataType_t getMiopenDataType(const at::Tensor& tensor); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t miopen_version(); diff --git a/aten/src/ATen/mkl/Descriptors.h b/aten/src/ATen/mkl/Descriptors.h index 4a006639a7f7..0198bbea401e 100644 --- a/aten/src/ATen/mkl/Descriptors.h +++ b/aten/src/ATen/mkl/Descriptors.h @@ -17,9 +17,14 @@ struct DftiDescriptorDeleter { class DftiDescriptor { public: void init(DFTI_CONFIG_VALUE precision, DFTI_CONFIG_VALUE signal_type, MKL_LONG signal_ndim, MKL_LONG* sizes) { +<<<<<<< HEAD if (desc_ != nullptr) { throw std::runtime_error("DFTI DESCRIPTOR can only be initialized once"); } +======= + TORCH_CHECK( + desc_ == nullptr, "DFTI DESCRIPTOR can only be initialized once"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DFTI_DESCRIPTOR *raw_desc; if (signal_ndim == 1) { MKL_DFTI_CHECK(DftiCreateDescriptor(&raw_desc, precision, signal_type, 1, sizes[0])); @@ -30,9 +35,14 @@ class DftiDescriptor { } DFTI_DESCRIPTOR *get() const { +<<<<<<< HEAD if (desc_ == nullptr) { throw std::runtime_error("DFTI DESCRIPTOR has not been initialized"); } +======= + TORCH_CHECK( + desc_ != nullptr, "DFTI DESCRIPTOR has not been initialized"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return desc_.get(); } diff --git a/aten/src/ATen/mkl/README.md b/aten/src/ATen/mkl/README.md index ee10392bdccd..919e59fa9945 100644 --- a/aten/src/ATen/mkl/README.md +++ b/aten/src/ATen/mkl/README.md @@ -1,4 +1,8 @@ All files living in this directory are written with the assumption that MKL is available, which means that these code are not guarded by `#if AT_MKL_ENABLED()`. Therefore, whenever you need to use definitions from here, please guard the `#include` and +<<<<<<< HEAD definition usages with `#if AT_MKL_ENABLED()` macro, e.g. [SpectralOps.cpp](native/mkl/SpectralOps.cpp). +======= +definition usages with `#if AT_MKL_ENABLED()` macro, e.g. [SpectralOps.cpp](../native/mkl/SpectralOps.cpp). +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/mps/EmptyTensor.cpp b/aten/src/ATen/mps/EmptyTensor.cpp index e6a292ba2a55..e320260b1918 100644 --- a/aten/src/ATen/mps/EmptyTensor.cpp +++ b/aten/src/ATen/mps/EmptyTensor.cpp @@ -1,5 +1,9 @@ // Copyright © 2022 Apple Inc. 
+<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -35,7 +39,11 @@ TensorBase empty_mps( layout_or_default(layout_opt) == Layout::Strided, "only strided tensors are supported on MPS"); +<<<<<<< HEAD TORCH_CHECK(size.size() <= 16, "MPS supports tensors with dimensions <= 16, but got ", size.size(), "."); +======= + TORCH_CHECK(size.size() <= c10::metal::max_ndim, "MPS supports tensors with dimensions <= ", c10::metal::max_ndim, ", but got ", size.size(), "."); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) check_size_nonnegative(size); diff --git a/aten/src/ATen/mps/MPSAllocator.h b/aten/src/ATen/mps/MPSAllocator.h index be17e364d58b..de80c5fc3560 100644 --- a/aten/src/ATen/mps/MPSAllocator.h +++ b/aten/src/ATen/mps/MPSAllocator.h @@ -345,7 +345,10 @@ class MPSHeapAllocatorImpl { return m_device; } +<<<<<<< HEAD // TODO: make a common function to do size unit conversions in PyTorch. +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline std::string format_size(uint64_t size) const; private: diff --git a/aten/src/ATen/mps/MPSAllocator.mm b/aten/src/ATen/mps/MPSAllocator.mm index cf0ebc869bb4..e4ec69a949bc 100644 --- a/aten/src/ATen/mps/MPSAllocator.mm +++ b/aten/src/ATen/mps/MPSAllocator.mm @@ -5,6 +5,10 @@ #include #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include @@ -21,19 +25,34 @@ init_buffer_pools(); // debug verbosity flags (see DebugVerbosity enum) +<<<<<<< HEAD static const char* verbosity_str = getenv("PYTORCH_DEBUG_MPS_ALLOCATOR"); m_debug_verbosity = verbosity_str ? strtol(verbosity_str, nullptr, 0) : DebugVerbosity::SILENT; static const char* high_watermark_ratio_str = getenv("PYTORCH_MPS_HIGH_WATERMARK_RATIO"); const double high_watermark_ratio = high_watermark_ratio_str ? strtod(high_watermark_ratio_str, nullptr) : default_high_watermark_ratio; +======= + static const auto verbosity_str = c10::utils::get_env("PYTORCH_DEBUG_MPS_ALLOCATOR"); + m_debug_verbosity = verbosity_str ? strtol(verbosity_str->c_str(), nullptr, 0) : DebugVerbosity::SILENT; + + static const auto high_watermark_ratio_str = c10::utils::get_env("PYTORCH_MPS_HIGH_WATERMARK_RATIO"); + const double high_watermark_ratio = + high_watermark_ratio_str ? strtod(high_watermark_ratio_str->c_str(), nullptr) : default_high_watermark_ratio; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) setHighWatermarkRatio(high_watermark_ratio); const double default_low_watermark_ratio = m_device.hasUnifiedMemory ? default_low_watermark_ratio_unified : default_low_watermark_ratio_discrete; +<<<<<<< HEAD static const char* low_watermark_ratio_str = getenv("PYTORCH_MPS_LOW_WATERMARK_RATIO"); const double low_watermark_ratio = low_watermark_ratio_str ? strtod(low_watermark_ratio_str, nullptr) : default_low_watermark_ratio; +======= + static const auto low_watermark_ratio_str = c10::utils::get_env("PYTORCH_MPS_LOW_WATERMARK_RATIO"); + const double low_watermark_ratio = + low_watermark_ratio_str ? 
strtod(low_watermark_ratio_str->c_str(), nullptr) : default_low_watermark_ratio; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) setLowWatermarkRatio(low_watermark_ratio); } @@ -638,7 +657,11 @@ std::lock_guard lock(m_mutex); BufferBlock* buffer_block = get_allocated_buffer_block(ptr); +<<<<<<< HEAD if (buffer_block && buffer_block->shape.size() > 0) { +======= + if (buffer_block && !buffer_block->shape.empty()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return IntArrayRef{buffer_block->shape}; } return IntArrayRef(); @@ -699,6 +722,7 @@ } inline std::string MPSHeapAllocatorImpl::format_size(uint64_t size) const { +<<<<<<< HEAD std::ostringstream os; os.precision(2); os << std::fixed; @@ -712,6 +736,9 @@ os << ((float)size / 1073741824.0) << " GB"; } return os.str(); +======= + return c10::CachingAllocator::format_size(size); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } // namespace HeapAllocator diff --git a/aten/src/ATen/mps/MPSDevice.h b/aten/src/ATen/mps/MPSDevice.h index 03637e7ca65f..ba1be6a97414 100644 --- a/aten/src/ATen/mps/MPSDevice.h +++ b/aten/src/ATen/mps/MPSDevice.h @@ -1,6 +1,10 @@ // Copyright © 2022 Apple Inc. #pragma once +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -70,4 +74,11 @@ TORCH_API bool is_available(); TORCH_API bool is_macos_13_or_newer(MacOSVersion version); TORCH_API at::Allocator* GetMPSAllocator(bool useSharedAllocator = false); +<<<<<<< HEAD +======= +inline Device getDeviceFromPtr(void* ptr) { + return {c10::DeviceType::MPS, 0}; +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::mps diff --git a/aten/src/ATen/mps/MPSFallback.mm b/aten/src/ATen/mps/MPSFallback.mm index a86c0f5feaa3..61f5caa5929a 100644 --- a/aten/src/ATen/mps/MPSFallback.mm +++ b/aten/src/ATen/mps/MPSFallback.mm @@ -2,6 +2,10 @@ #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include namespace at { @@ -76,8 +80,13 @@ static Tensor slow_conv2d_forward_mps(const Tensor& self, } TORCH_LIBRARY_IMPL(_, MPS, m) { +<<<<<<< HEAD static const char* enable_mps_fallback = getenv("PYTORCH_ENABLE_MPS_FALLBACK"); if (!enable_mps_fallback || std::stoi(enable_mps_fallback) == 0) { +======= + static const auto enable_mps_fallback = c10::utils::get_env("PYTORCH_ENABLE_MPS_FALLBACK"); + if (!enable_mps_fallback || enable_mps_fallback == "0") { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) m.fallback(torch::CppFunction::makeFromBoxedFunction<&mps_error_fallback>()); } else { m.fallback(torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); @@ -91,9 +100,13 @@ static Tensor slow_conv2d_forward_mps(const Tensor& self, m.impl("embedding_renorm_", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); m.impl("linalg_svd", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); m.impl("linalg_svd.U", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); +<<<<<<< HEAD m.impl("col2im", 
torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); m.impl("_slow_conv2d_forward", slow_conv2d_forward_mps); m.impl("upsample_nearest3d.vec", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); +======= + m.impl("_slow_conv2d_forward", slow_conv2d_forward_mps); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } // namespace at diff --git a/aten/src/ATen/mps/MPSGuardImpl.h b/aten/src/ATen/mps/MPSGuardImpl.h index 7ff2d13ceefa..04dc816a8b7a 100644 --- a/aten/src/ATen/mps/MPSGuardImpl.h +++ b/aten/src/ATen/mps/MPSGuardImpl.h @@ -36,7 +36,14 @@ struct TORCH_API MPSGuardImpl final // constructor MPSGuardImpl() {} explicit MPSGuardImpl(c10::DeviceType t) { +<<<<<<< HEAD TORCH_INTERNAL_ASSERT(t == c10::DeviceType::MPS); +======= + TORCH_CHECK( + t == DeviceType::MPS, + "MPSGuardImpl initialized with non-MPS DeviceType: ", + t); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // returns the type @@ -57,7 +64,11 @@ struct TORCH_API MPSGuardImpl final } void setDevice(Device d) const override { +<<<<<<< HEAD TORCH_INTERNAL_ASSERT(d.is_mps()); +======= + TORCH_CHECK(d.is_mps(), "Expected a MPS device, but got ", d); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void uncheckedSetDevice(Device d) const noexcept override { diff --git a/aten/src/ATen/mps/MPSHooks.h b/aten/src/ATen/mps/MPSHooks.h index 17a3d3a68cec..3345b119849e 100644 --- a/aten/src/ATen/mps/MPSHooks.h +++ b/aten/src/ATen/mps/MPSHooks.h @@ -18,6 +18,11 @@ struct MPSHooks : public at::MPSHooksInterface { bool hasMPS() const override; bool isOnMacOSorNewer(unsigned major, unsigned minor) const override; +<<<<<<< HEAD +======= + Device getDeviceFromPtr(void* data) const override; + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // MPSGeneratorImpl interface const Generator& getDefaultGenerator( DeviceIndex device_index = -1) const override; diff --git a/aten/src/ATen/mps/MPSHooks.mm b/aten/src/ATen/mps/MPSHooks.mm index 03c39c957368..9ec7f8b4d468 100644 --- a/aten/src/ATen/mps/MPSHooks.mm +++ b/aten/src/ATen/mps/MPSHooks.mm @@ -129,6 +129,13 @@ at::mps::getMPSEventPool()->recordEvent(event_id, /* syncEvent*/ true); } +<<<<<<< HEAD +======= +Device MPSHooks::getDeviceFromPtr(void* data) const { + return at::mps::getDeviceFromPtr(data); +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void MPSHooks::waitForEvent(uint32_t event_id) const { at::mps::getMPSEventPool()->waitForEvent(event_id, /* syncEvent*/ true); } diff --git a/aten/src/ATen/mps/MPSProfiler.mm b/aten/src/ATen/mps/MPSProfiler.mm index 6adce7d382a6..3037098ff119 100644 --- a/aten/src/ATen/mps/MPSProfiler.mm +++ b/aten/src/ATen/mps/MPSProfiler.mm @@ -2,6 +2,10 @@ #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include // these need to be literal strings when passed to os_signpost*() @@ -91,11 +95,19 @@ MPSProfiler::MPSProfiler() : m_os_log_events(nullptr), m_os_log_intervals(nullptr) { // see enum LogOptions for the description. 
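// The MPS hunks above and below migrate from raw getenv()/strtol() parsing to
// c10::utils::get_env(), which the new code treats as an optional string.
// A small self-contained sketch of that pattern; get_env_sketch and
// env_flag_or are stand-in helpers for illustration, not the c10 API:
#include <cstdlib>
#include <optional>
#include <string>

static std::optional<std::string> get_env_sketch(const char* name) {
  const char* raw = std::getenv(name);
  if (raw == nullptr) {
    return std::nullopt;
  }
  return std::string(raw);
}

static long env_flag_or(const char* name, long fallback) {
  const auto value = get_env_sketch(name);
  // Same shape as the profiler/allocator code: parse only when the variable
  // is set, otherwise fall back to the default.
  return value ? std::strtol(value->c_str(), nullptr, 0) : fallback;
}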
+<<<<<<< HEAD static const char* log_options_str = getenv(kEVLogProfileInfoStr); m_log_options = log_options_str ? strtol(log_options_str, nullptr, 0) : 0; // see enums profilerOptions and SignpostTypes for the description. static const char* trace_signpost_str = getenv(kEVTraceSignpostsStr); uint32_t trace_signposts = trace_signpost_str ? strtol(trace_signpost_str, nullptr, 0) : 0; +======= + static const auto log_options_str = c10::utils::get_env(kEVLogProfileInfoStr); + m_log_options = log_options_str ? strtol(log_options_str->c_str(), nullptr, 0) : 0; + // see enums profilerOptions and SignpostTypes for the description. + static const auto trace_signpost_str = c10::utils::get_env(kEVTraceSignpostsStr); + uint32_t trace_signposts = trace_signpost_str ? strtol(trace_signpost_str->c_str(), nullptr, 0) : 0; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(m_log_options <= LogOptions::LOG_COUNT, "invalid log options ", @@ -779,8 +791,13 @@ } // used to capture sigint signal to log profiling stats +<<<<<<< HEAD struct sigaction MPSProfiler::currentSigint {}; struct sigaction MPSProfiler::previousSigint {}; +======= +struct sigaction MPSProfiler::currentSigint{}; +struct sigaction MPSProfiler::previousSigint{}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool MPSProfiler::isCapturing() const { return [captureManager isCapturing]; diff --git a/aten/src/ATen/native/AdaptiveAveragePooling.cpp b/aten/src/ATen/native/AdaptiveAveragePooling.cpp index 8b1182982002..d8043b4664a9 100644 --- a/aten/src/ATen/native/AdaptiveAveragePooling.cpp +++ b/aten/src/ATen/native/AdaptiveAveragePooling.cpp @@ -63,6 +63,7 @@ namespace { const Tensor& grad_output, const Tensor& input) { +<<<<<<< HEAD int64_t ndim = grad_output.ndimension(); for (const auto i : c10::irange(1, ndim)) { TORCH_CHECK(grad_output.size(i) > 0, @@ -77,6 +78,18 @@ namespace { "expected dtype ", input.dtype(), " for `grad_output` but got dtype ", grad_output.dtype()); TORCH_CHECK(input.dtype() == grad_input.dtype(), "expected dtype ", input.dtype(), " for `grad_input` but got dtype ", grad_input.dtype()); +======= + adaptive_pool_empty_output_check(grad_output, "adaptive_avg_pool2d_backward"); + int64_t ndim = grad_output.dim(); + TORCH_CHECK(input.dim() == ndim, + __func__, ": Expected dimensions ", input.dim(), " for `grad_output` but got dimensions ", ndim); + TORCH_CHECK((ndim == 3 || ndim == 4), + __func__, ": Expected 3D or 4D tensor, but got ", input.sizes()); + TORCH_CHECK(input.dtype() == grad_output.dtype(), + __func__, ": Expected dtype ", input.dtype(), " for `grad_output` but got dtype ", grad_output.dtype()); + TORCH_CHECK(input.dtype() == grad_input.dtype(), + __func__, ": Expected dtype ", input.dtype(), " for `grad_input` but got dtype ", grad_input.dtype()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) grad_input.resize_(input.sizes(), input.suggest_memory_format()); grad_input.zero_(); diff --git a/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp b/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp index 4897864a378b..4073d60e8fe3 100644 --- a/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp +++ b/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp @@ -235,6 +235,11 @@ Tensor& adaptive_avg_pool3d_backward_out_cpu_template( auto gradOutput = gradOutput_.contiguous(); 
adaptive_pool_empty_output_check(gradOutput_, "adaptive_avg_pool3d_backward"); +<<<<<<< HEAD +======= + TORCH_CHECK(input.dim() == gradOutput_.dim(), + __func__, ": Expected dimensions ", input.dim(), " for `gradOutput_` but got dimensions ", gradOutput_.dim()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /* sizes */ int64_t sizeD = input.size(-4); diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index 897e83890c79..a126572d5c19 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -125,11 +125,19 @@ // linear algebra function uses that routine #if AT_BUILD_WITH_LAPACK() +<<<<<<< HEAD +======= +#ifndef _ARMPL_H // ArmPL's `cblas.h` pulls in these prototypes. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // getrf extern "C" void zgetrf_(int *m, int *n, std::complex *a, int *lda, int *ipiv, int *info); extern "C" void cgetrf_(int *m, int *n, std::complex *a, int *lda, int *ipiv, int *info); extern "C" void dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv, int *info); extern "C" void sgetrf_(int *m, int *n, float *a, int *lda, int *ipiv, int *info); +<<<<<<< HEAD +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // potrs #if defined(_WIN32) && defined(_M_ARM64) @@ -165,13 +173,25 @@ static inline void spotrs_(char *uplo, int *n, int *nrhs, float *a, int *lda, fl #else +<<<<<<< HEAD +======= +#ifndef _ARMPL_H // ArmPL's `cblas.h` pulls in these prototypes. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) extern "C" void zpotrs_(char *uplo, int *n, int *nrhs, std::complex *a, int *lda, std::complex *b, int *ldb, int *info); extern "C" void cpotrs_(char *uplo, int *n, int *nrhs, std::complex *a, int *lda, std::complex *b, int *ldb, int *info); extern "C" void dpotrs_(char *uplo, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, int *info); extern "C" void spotrs_(char *uplo, int *n, int *nrhs, float *a, int *lda, float *b, int *ldb, int *info); +<<<<<<< HEAD + +#endif +======= #endif +#endif + +#ifndef _ARMPL_H // ArmPL's `cblas.h` pulls in these prototypes. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // potrf extern "C" void zpotrf_(char *uplo, int *n, std::complex *a, int *lda, int *info); extern "C" void cpotrf_(char *uplo, int *n, std::complex *a, int *lda, int *info); @@ -317,6 +337,10 @@ extern "C" void zungqr_(int *m, int *n, int *k, std::complex *a, int *ld extern "C" void cungqr_(int *m, int *n, int *k, std::complex *a, int *lda, std::complex *tau, std::complex *work, int *lwork, int *info); extern "C" void dorgqr_(int *m, int *n, int *k, double *a, int *lda, double *tau, double *work, int *lwork, int *info); extern "C" void sorgqr_(int *m, int *n, int *k, float *a, int *lda, float *tau, float *work, int *lwork, int *info); +<<<<<<< HEAD +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // ormqr #if defined(_WIN32) && defined(_M_ARM64) @@ -347,11 +371,20 @@ static inline void sormqr_(char *side, char *trans, int *m, int *n, int *k, floa *info = LAPACKE_sormqr_work(LAPACK_COL_MAJOR, *side, *trans, *m, *n, *k, a, *lda, tau, c, *ldc, work, *lwork); } #else +<<<<<<< HEAD +======= +#ifndef _ARMPL_H // ArmPL's `cblas.h` pulls in these prototypes. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) extern "C" void zunmqr_(char *side, char *trans, int *m, int *n, int *k, std::complex *a, int *lda, std::complex *tau, std::complex *c, int *ldc, std::complex *work, int *lwork, int *info); extern "C" void cunmqr_(char *side, char *trans, int *m, int *n, int *k, std::complex *a, int *lda, std::complex *tau, std::complex *c, int *ldc, std::complex *work, int *lwork, int *info); extern "C" void dormqr_(char *side, char *trans, int *m, int *n, int *k, double *a, int *lda, double *tau, double *c, int *ldc, double *work, int *lwork, int *info); extern "C" void sormqr_(char *side, char *trans, int *m, int *n, int *k, float *a, int *lda, float *tau, float *c, int *ldc, float *work, int *lwork, int *info); #endif +<<<<<<< HEAD +======= +#endif +#ifndef _ARMPL_H // ArmPL's `cblas.h` pulls in these prototypes. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // syevd extern "C" void zheevd_(char *jobz, char *uplo, int *n, std::complex *a, int *lda, double *w, std::complex *work, int *lwork, double *rwork, int *lrwork, int *iwork, int *liwork, int *info); extern "C" void cheevd_(char *jobz, char *uplo, int *n, std::complex *a, int *lda, float *w, std::complex *work, int *lwork, float *rwork, int *lrwork, int *iwork, int *liwork, int *info); @@ -466,14 +499,26 @@ extern "C" void sgelss_(int *m, int *n, int *nrhs, float *s, float *rcond, int *rank, float *work, int *lwork, int *info); #endif +<<<<<<< HEAD #if AT_BUILD_WITH_BLAS() // trsm +======= +#endif + +#if AT_BUILD_WITH_BLAS() +// trsm +#ifndef _ARMPL_H // ArmPL's `cblas.h` pulls in these prototypes. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) extern "C" void ztrsm_(char *side, char *uplo, char *trans, char *diag, int *n, int *nrhs, std::complex *alpha, std::complex *a, int *lda, std::complex *b, int *ldb); extern "C" void ctrsm_(char *side, char *uplo, char *trans, char *diag, int *n, int *nrhs, std::complex *alpha, std::complex *a, int *lda, std::complex *b, int *ldb); extern "C" void dtrsm_(char *side, char *uplo, char *trans, char *diag, int *n, int *nrhs, double *alpha, double *a, int *lda, double *b, int *ldb); extern "C" void strsm_(char *side, char *uplo, char *trans, char *diag, int *n, int *nrhs, float *alpha, float *a, int *lda, float *b, int *ldb); #endif +<<<<<<< HEAD +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::meta { @@ -685,7 +730,11 @@ TORCH_META_FUNC(linalg_cholesky_ex)(const Tensor& A, auto ndim = A_shape.size(); // L +<<<<<<< HEAD auto L_strides = at::native::batched_matrix_contiguous_strides(A_shape, /*f-contig*=*/A.device().type() != at::kMPS); +======= + auto L_strides = at::native::batched_matrix_contiguous_strides(A_shape, /*f-contig*=*/true); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set_output_strided(0, A_shape, L_strides, A.options(), {}); // info @@ -849,10 +898,14 @@ namespace at::native { // linear algebra operations template +<<<<<<< HEAD void lapackCholeskySolve(char uplo, int n, int nrhs, scalar_t *a, int lda, scalar_t *b, int ldb, int *info); template void lapackSymeig(char jobz, char uplo, int n, scalar_t *a, int lda, value_t *w, scalar_t *work, int lwork, value_t *rwork, int *info); +======= +static void lapackCholeskySolve(char uplo, int n, int nrhs, scalar_t *a, int lda, scalar_t *b, int ldb, int *info); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template<> void lapackLu>(int m, int n, c10::complex *a, int lda, int *ipiv, int *info) { zgetrf_(&m, &n, reinterpret_cast*>(a), &lda, ipiv, info); @@ -2694,20 +2747,30 @@ Tensor& ormqr_out(const Tensor& input, const Tensor& tau, const Tensor& other, b int64_t left_size_condition = left ? 
-2 : -1; TORCH_CHECK( +<<<<<<< HEAD other.size(left_size_condition) >= tau.size(-1), "torch.ormqr: other.shape[", left_size_condition, "] must be greater than or equal to tau.shape[-1]"); TORCH_CHECK( +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) other.size(left_size_condition) == input.size(-2), "torch.ormqr: other.shape[", left_size_condition, "] must be equal to input.shape[-2]"); TORCH_CHECK( +<<<<<<< HEAD tau.size(-1) <= input.size(-1), "torch.ormqr: tau.shape[-1] must be less than or equal to input.shape[-1]"); +======= + std::min(other.size(left_size_condition), input.size(-1)) == tau.size(-1), + "torch.ormqr: tau.shape[-1] must be equal to min(other.shape[", + left_size_condition, + "], input.shape[-1])"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK( input.dim() - tau.dim() == 1, @@ -2716,6 +2779,10 @@ Tensor& ormqr_out(const Tensor& input, const Tensor& tau, const Tensor& other, b tau.dim(), " and input.ndim is equal to ", input.dim()); +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK( input.dim() == other.dim(), "torch.ormqr: ", diff --git a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp index 8dce552b0e13..8b83b8576431 100644 --- a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp @@ -137,6 +137,28 @@ Tensor& cholesky_inverse_kernel_impl(Tensor& result, Tensor& infos, bool upper) } /* +<<<<<<< HEAD +======= + LAPACK query functions return workspace size as floating point value, which means + that it might not be accurately represented if it's size exceed mantissa of the + corresponding type. Fix it by adding 1ULP to the value before casting to it + For more info see https://github.com/pytorch/pytorch/issues/145801#issuecomment-2631781776 +*/ +template +static inline +std::enable_if_t, int> lapack_work_to_int(const T val) { + const auto next_after = std::nextafter(val, std::numeric_limits::infinity()); + return std::max(1, std::ceil(next_after)); +} +template +static inline +std::enable_if_t::value, int> lapack_work_to_int(const T val) { + return lapack_work_to_int(val.real()); +} + + +/* +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Computes the eigenvalues and eigenvectors of n-by-n matrix 'input'. This is an in-place routine, content of 'input', 'values', 'vectors' is overwritten. 'infos' is an int Tensor containing error codes for each matrix in the batched input. 
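// The lapack_work_to_int() helper introduced above rounds a floating-point
// workspace query up by one ULP before converting to int, so a size that the
// float/double query result cannot represent exactly is never truncated to a
// too-small allocation. A minimal single-type sketch of the same idea (the
// real helper is templated and also handles complex-valued query results):
#include <algorithm>
#include <cmath>
#include <limits>

static int work_query_to_int_sketch(float wkopt) {
  // Bump to the next representable value toward +inf, then round up and
  // clamp to at least 1, matching the behaviour described in the comment.
  const float bumped = std::nextafter(wkopt, std::numeric_limits<float>::infinity());
  return std::max(1, static_cast<int>(std::ceil(bumped)));
}
// e.g. a true workspace of 16777217 elements cannot be represented exactly in
// a float query result and may come back as 16777216.0f; bumping by one ULP
// before ceil() yields 16777218, so the allocation never falls short.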
@@ -178,7 +200,11 @@ void apply_linalg_eig(Tensor& values, Tensor& vectors, Tensor& input, Tensor& in lapackEig(jobvl, jobvr, n, input_data, lda, values_data, lvectors_data, ldvl, rvectors_data, ldvr, &work_query, -1, rwork_data, &infos_data[0]); +<<<<<<< HEAD int lwork = std::max(1, static_cast(real_impl(work_query))); +======= + int lwork = lapack_work_to_int(work_query); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor work = at::empty({lwork}, input.dtype()); auto work_data = work.mutable_data_ptr(); @@ -218,6 +244,11 @@ void linalg_eig_kernel(Tensor& eigenvalues, Tensor& eigenvectors, Tensor& infos, 'compute_eigenvectors' controls whether eigenvectors should be computed. This function doesn't do any error checks and it's assumed that every argument is valid. */ +<<<<<<< HEAD +======= + + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template void apply_lapack_eigh(const Tensor& values, const Tensor& vectors, const Tensor& infos, bool upper, bool compute_eigenvectors) { #if !AT_BUILD_WITH_LAPACK() @@ -256,8 +287,12 @@ void apply_lapack_eigh(const Tensor& values, const Tensor& vectors, const Tensor lapackSyevd(jobz, uplo, n, vectors_data, lda, values_data, &lwork_query, lwork, &rwork_query, lrwork, &iwork_query, liwork, infos_data); +<<<<<<< HEAD value_t next_after_lw = std::nextafter(real_impl(lwork_query), std::numeric_limits::infinity()); lwork = std::max(1, std::ceil(next_after_lw)); +======= + lwork = lapack_work_to_int(lwork_query); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor work = at::empty({lwork}, vectors.options()); auto work_data = work.mutable_data_ptr(); @@ -269,8 +304,12 @@ void apply_lapack_eigh(const Tensor& values, const Tensor& vectors, const Tensor Tensor rwork; value_t* rwork_data = nullptr; if (vectors.is_complex()) { +<<<<<<< HEAD value_t next_after_rwork_query = std::nextafter(rwork_query, std::numeric_limits::infinity()); lrwork = std::max(1, std::ceil(next_after_rwork_query)); +======= + lrwork = lapack_work_to_int(rwork_query); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) rwork = at::empty({lrwork}, values.options()); rwork_data = rwork.mutable_data_ptr(); } @@ -331,7 +370,10 @@ static void apply_geqrf(const Tensor& input, const Tensor& tau) { "Calling torch.geqrf on a CPU tensor requires compiling ", "PyTorch with LAPACK. Please use PyTorch built with LAPACK support."); #else +<<<<<<< HEAD using value_t = typename c10::scalar_value_type::type; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto input_data = input.data_ptr(); auto tau_data = tau.data_ptr(); auto input_matrix_stride = matrixStride(input); @@ -353,7 +395,11 @@ static void apply_geqrf(const Tensor& input, const Tensor& tau) { // if lwork is less than 'n' then a warning is printed: // Intel MKL ERROR: Parameter 7 was incorrect on entry to SGEQRF. 
+<<<<<<< HEAD lwork = std::max({1, static_cast(n), static_cast(real_impl(wkopt))}); +======= + lwork = std::max(static_cast(n), lapack_work_to_int(wkopt)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor work = at::empty({lwork}, input.options()); for (const auto i : c10::irange(batch_size)) { @@ -401,7 +447,10 @@ inline void apply_orgqr(Tensor& self, const Tensor& tau) { return; } +<<<<<<< HEAD using value_t = typename c10::scalar_value_type::type; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto self_data = self.data_ptr(); auto tau_data = tau.const_data_ptr(); auto self_matrix_stride = matrixStride(self); @@ -425,7 +474,11 @@ inline void apply_orgqr(Tensor& self, const Tensor& tau) { scalar_t wkopt; lapackOrgqr(m, n, k, self_data, lda, const_cast(tau_data), &wkopt, lwork, &info); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(info == 0); +<<<<<<< HEAD lwork = std::max(1, real_impl(wkopt)); +======= + lwork = lapack_work_to_int(wkopt); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor work = at::empty({lwork}, self.options()); for (const auto i : c10::irange(batch_size)) { @@ -544,7 +597,11 @@ void apply_lstsq(const Tensor& A, Tensor& B, Tensor& rank, Tensor& singular_valu s_working_ptr, &iwork_opt); +<<<<<<< HEAD lwork = std::max(1, real_impl(work_opt)); +======= + lwork = lapack_work_to_int(work_opt); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor work = at::empty({lwork}, A.options()); scalar_t* work_data = work.mutable_data_ptr(); @@ -1066,7 +1123,11 @@ static void apply_svd(const Tensor& A, { scalar_t wkopt; lapackSvd(jobz, m, n, A_data, lda, S_data, U_data, ldu, Vh_data, ldvh, &wkopt, lwork, rwork_data, iwork_data, info_data); +<<<<<<< HEAD lwork = std::max(1, real_impl(wkopt)); +======= + lwork = lapack_work_to_int(wkopt); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } auto work = std::vector(lwork); auto* const work_data = work.data(); diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index 7e4e77a67a8a..2f48e1e34ada 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -1383,35 +1383,59 @@ Tensor bitwise_right_shift(const Scalar& self, const Tensor& other) { } template +<<<<<<< HEAD Tensor& comparison_op_out(Tensor& result, const Tensor& self, const Tensor& other, Stub& stub) { +======= +static Tensor& comparison_op_out(Tensor& result, const Tensor& self, const Tensor& other, Stub& stub) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto iter = TensorIterator::comparison_op(result, self, other); stub(iter.device_type(), iter); return result; } template +<<<<<<< HEAD Tensor comparison_op(const Tensor& self, const Tensor& other, OutImpl& out_impl) { +======= +static Tensor comparison_op(const Tensor& self, const Tensor& other, OutImpl& out_impl) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor result = at::empty({0}, self.options().dtype(kBool)); return out_impl(result, self, other); } template +<<<<<<< HEAD Tensor& 
comparison_op_(Tensor& self, const Tensor& other, OutImpl& out_impl) { +======= +static Tensor& comparison_op_(Tensor& self, const Tensor& other, OutImpl& out_impl) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return out_impl(self, self, other); } template +<<<<<<< HEAD Tensor& comparison_op_out(Tensor& result, const Tensor& self, const Scalar& other, OutImpl& out_impl) { +======= +static Tensor& comparison_op_out(Tensor& result, const Tensor& self, const Scalar& other, OutImpl& out_impl) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return out_impl(result, self, wrapped_scalar_tensor(other)); } template +<<<<<<< HEAD Tensor comparison_op(const Tensor& self, const Scalar& other, OutImpl& out_impl) { +======= +static Tensor comparison_op(const Tensor& self, const Scalar& other, OutImpl& out_impl) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return comparison_op(self, wrapped_scalar_tensor(other), out_impl); } template +<<<<<<< HEAD Tensor& comparison_op_(Tensor& self, const Scalar& other, OutImpl& out_impl) { +======= +static Tensor& comparison_op_(Tensor& self, const Scalar& other, OutImpl& out_impl) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return out_impl(self, self, wrapped_scalar_tensor(other)); } diff --git a/aten/src/ATen/native/Blas.cpp b/aten/src/ATen/native/Blas.cpp index f62c31777822..3c40eaefe5ff 100644 --- a/aten/src/ATen/native/Blas.cpp +++ b/aten/src/ATen/native/Blas.cpp @@ -7,6 +7,14 @@ #include #include +<<<<<<< HEAD +======= +#include +#include +#if !defined(__s390x__) && !defined(__powerpc__) +#include +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef AT_PER_OPERATOR_HEADERS #include @@ -24,6 +32,12 @@ #include #include #include +<<<<<<< HEAD +======= +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif namespace at::meta { @@ -222,4 +236,109 @@ Tensor vdot(const Tensor &self, const Tensor &other){ } +<<<<<<< HEAD +======= +static Tensor& +_scaled_mm_out_cpu_emulated(const Tensor& mat1, const Tensor& mat2, + const Tensor& scale_a, + const Tensor& scale_b, + const std::optional& bias, + const std::optional& scale_result, + std::optional out_dtype, + bool use_fast_accum, + Tensor& out) { + TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix"); + TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix"); + TORCH_CHECK( + mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (", + mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); + + TORCH_INTERNAL_ASSERT((scale_a.numel() == 1 && scale_b.numel() == 1), "Now _scaled_mm only supports per-tensor scaling for CPU backend."); + TORCH_CHECK( + !scale_result || + (scale_result->numel() == 1 && scale_result->scalar_type() == kFloat), + "scale_result must be a float scalar"); + TORCH_CHECK(!bias || bias->numel() == mat2.sizes()[1], "Bias must be size ", mat2.sizes()[1], + " but got ", bias->numel()); + + // Check types + TORCH_CHECK(!out_dtype || *out_dtype == out.scalar_type(), "out_dtype must match output matrix type"); + 
TORCH_CHECK(isFloat8Type(mat1.scalar_type()), "Expected mat1 to be Float8 matrix got ", mat1.scalar_type()); + TORCH_CHECK(isFloat8Type(mat2.scalar_type()), "Expected mat2 to be Float8 matrix got ", mat2.scalar_type()); + + auto mat1_c = mat1.contiguous(); + auto mat2_c = mat2.contiguous(); + IntArrayRef mat1_sizes = mat1_c.sizes(); + IntArrayRef mat2_sizes = mat2_c.sizes(); + at::native::resize_output(out, {mat1_sizes[0], mat2_sizes[1]}); + + float input_scale = scale_a.item(); + float weight_scale = scale_b.item(); + float output_scale = float(1.0); + if (scale_result.has_value() && + (*out_dtype == ScalarType::Float8_e4m3fn || + *out_dtype == ScalarType::Float8_e5m2)) { + output_scale = scale_result.value().item(); + } + auto fp32_mat1 = at::mul(mat1.to(kFloat), input_scale); + auto fp32_mat2 = at::mul(mat2_c.to(kFloat), weight_scale); + auto out_tmp = at::matmul(fp32_mat1, fp32_mat2); + if (bias) { + out_tmp.add_(bias.value()); + } + if (*out_dtype == ScalarType::Float8_e4m3fn || + *out_dtype == ScalarType::Float8_e5m2) { + out_tmp = at::mul(out_tmp, 1 / output_scale); + } + out_tmp = out_tmp.to(out.scalar_type()); + out.copy_(out_tmp); + return out; +} + +Tensor& +_scaled_mm_out_cpu(const Tensor& mat1, const Tensor& mat2, + const Tensor& scale_a, + const Tensor& scale_b, + const std::optional& bias, + const std::optional& scale_result, + std::optional out_dtype, + bool use_fast_accum, + Tensor& out) { +#if AT_MKLDNN_ENABLED() && !defined(__powerpc__) + if (at::globalContext().userEnabledMkldnn()) { + bool mixed_dtype = mat1.scalar_type() != mat2.scalar_type(); + if ((!mixed_dtype && cpuinfo_has_x86_amx_int8()) || + (mixed_dtype && cpuinfo_has_x86_amx_fp16())) { + return mkldnn_scaled_mm( + mat1, + mat2, + scale_a, + scale_b, + bias, + scale_result, + out_dtype, + use_fast_accum, + out); + } + } +#endif + { + return _scaled_mm_out_cpu_emulated(mat1, mat2, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out); + } +} + +Tensor +_scaled_mm_cpu(const Tensor& mat_a, const Tensor& mat_b, + const Tensor& scale_a, + const Tensor& scale_b, + const std::optional& bias, + const std::optional& scale_result, + std::optional out_dtype, + bool use_fast_accum) { + const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type()); + Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_)); + return _scaled_mm_out_cpu(mat_a, mat_b, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out); +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::native diff --git a/aten/src/ATen/native/BlasKernel.cpp b/aten/src/ATen/native/BlasKernel.cpp index 58cc456254d8..3f7171a8fadd 100644 --- a/aten/src/ATen/native/BlasKernel.cpp +++ b/aten/src/ATen/native/BlasKernel.cpp @@ -33,11 +33,19 @@ T* remove_const(const T* x) { } // namespace #if AT_BUILD_WITH_BLAS() +<<<<<<< HEAD +======= +#ifndef _ARMPL_H +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) extern "C" double ddot_(int *n, double *x, int *incx, double *y, int *incy); extern "C" void dscal_(int *n, double *a, double *x, int *incx); extern "C" void sscal_(int *n, float *a, float *x, int *incx); extern "C" void dgemv_(char *trans, int *m, int *n, double *alpha, double *a, int *lda, double *x, int *incx, double *beta, double *y, int *incy); extern "C" void sgemv_(char *trans, int *m, int *n, float *alpha, float *a, int *lda, float *x, int *incx, float *beta, 
float *y, int *incy); +<<<<<<< HEAD +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if AT_BLAS_F2C() # define ffloat double @@ -52,10 +60,18 @@ extern "C" void sgemv_(char *trans, int *m, int *n, float *alpha, float *a, int extern "C" void cblas_cdotc_sub(const int n, const void *x, const int incx, const void *y, const int incy, void *dotc); extern "C" void cblas_zdotc_sub(const int n, const void *x, const int incx, const void *y, const int incy, void *dotc); +<<<<<<< HEAD +======= +#ifndef _ARMPL_H +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline ffloat sdot_(const int *n, const float *x, const int *incx, const float *y, const int *incy) { return cblas_sdot(*n, x, *incx, y, *incy); } +<<<<<<< HEAD +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline void cdotu_(std::complex *res, const int *n, const std::complex *x, const int *incx, const std::complex *y, const int *incy) { cblas_cdotu_sub(*n, x, *incx, y, *incy, res); @@ -86,6 +102,11 @@ namespace at::native { #if !defined(C10_MOBILE) DEFINE_DISPATCH(fp16_gemv_trans_stub); DEFINE_DISPATCH(bf16_gemv_trans_stub); +<<<<<<< HEAD +======= +DEFINE_DISPATCH(fp16_dot_stub); +DEFINE_DISPATCH(bf16_dot_stub); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif // !defined(C10_MOBILE) namespace blas_impl { @@ -116,6 +137,7 @@ void fp16_gemv_trans( fp16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy); } +<<<<<<< HEAD void bf16_gemv_trans( const int m, const int n, @@ -127,10 +149,67 @@ void bf16_gemv_trans( const at::BFloat16 beta, at::BFloat16* y, const int incy); +======= +static float fp16_dot( + const int64_t n, + const Half* x, + const int64_t incx, + const Half* y, + const int64_t incy) { + return fp16_dot_stub(kCPU, n, x, incx, y, incy); +} + +static float bf16_dot( + const int64_t n, + const BFloat16* x, + const int64_t incx, + const BFloat16* y, + const int64_t incy) { + return bf16_dot_stub(kCPU, n, x, incx, y, incy); +} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif // !defined(C10_MOBILE) #if defined(__aarch64__) && !defined(C10_MOBILE) +<<<<<<< HEAD +======= +#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC +static void fp16_gemv_notrans_fp16_arith(int m, int n, const float16_t* a, const int lda, const float16_t *x, float16_t *y) { + for (auto j = 0; j < n; j++) { + auto vecCol = vdup_n_f16(x[j]); + const auto* column = a + lda * j; + for (auto i = 0; i < m; i += 4) { + auto yf16 = y + i; + auto matRow = vld1_f16(column + i); + auto resVec = j != 0 ? vld1_f16(yf16) : vdup_n_f16(0); + resVec = vfma_lane_f16(resVec, matRow, vecCol, 0); + vst1_f16(yf16, resVec); + } + } +} +#endif + +static void fp16_gemv_notrans_fp32_arith(int m, int n, const float16_t* a, const int lda, const float16_t *x, float16_t *y) { + std::vector sum(m); + for (auto j = 0; j < n; j++) { + auto vecCol = vdup_n_f32(x[j]); + const auto* column = a + lda * j; + for (auto i = 0; i < m; i += 4) { + auto sf32 = sum.data() + i; + auto matRow = vcvt_f32_f16(vld1_f16(column + i)); + auto resVec = j != 0 ? 
vld1q_f32(sf32) : vdupq_n_f32(0); + resVec = vfmaq_lane_f32(resVec, matRow, vecCol, 0); + vst1q_f32(sf32, resVec); + } + } + + for (auto i = 0; i < m; i+= 4) { + vst1_f16(y + i, vcvt_f16_f32(vld1q_f32(sum.data() + i))); + } +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void fp16_gemv_notrans( const int m, const int n, @@ -143,17 +222,66 @@ void fp16_gemv_notrans( Half* y, const int incy); +<<<<<<< HEAD #endif // defined(__aarch64__) && !defined(C10_MOBILE) template bool scal_use_fast_path( +======= +void fp16_gemv_notrans( + const int m, + const int n, + const float alpha, + const Half* a, + const int lda, + const Half* x, + const int incx, + const float beta, + Half* y, + const int incy) { + if (incx == 1 && alpha == 1.0 && beta == 0.0 && m % 4 == 0 && incy == 1) { +#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC + if (at::globalContext().allowFP16ReductionCPU()) { + return fp16_gemv_notrans_fp16_arith(m, n, reinterpret_cast(a), lda, reinterpret_cast(x), reinterpret_cast(y)); + } +#endif + return fp16_gemv_notrans_fp32_arith(m, n, reinterpret_cast(a), lda, reinterpret_cast(x), reinterpret_cast(y)); + } + std::vector sum(m); + for (const auto j : c10::irange(n)) { + const auto* column_ = a + lda * j; + auto z = alpha * x[j * incx]; + for (const auto i : c10::irange(m)) { + sum[i] += z * column_[i]; + } + } + if (beta == 0.0) { + for (const auto i : c10::irange(m)) { + y[i * incy] = sum[i]; + } + } else { + for (const auto i : c10::irange(m)) { + y[i * incy] += sum[i]; + } + } +} + +#endif // defined(__aarch64__) && !defined(C10_MOBILE) + +template +static bool scal_use_fast_path( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) [[maybe_unused]] int64_t n, [[maybe_unused]] int64_t incx) { return false; } template +<<<<<<< HEAD bool gemv_use_fast_path( +======= +static bool gemv_use_fast_path( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) [[maybe_unused]] char trans, [[maybe_unused]] int64_t m, [[maybe_unused]] int64_t n, @@ -166,7 +294,11 @@ bool gemv_use_fast_path( } template +<<<<<<< HEAD void scal_fast_path( +======= +static void scal_fast_path( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) [[maybe_unused]] int* n, [[maybe_unused]] scalar_t* a, [[maybe_unused]] scalar_t* x, @@ -176,7 +308,11 @@ void scal_fast_path( } template +<<<<<<< HEAD void gemv_fast_path( +======= +static void gemv_fast_path( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) [[maybe_unused]] const char* trans, [[maybe_unused]] const int* m, [[maybe_unused]] const int* n, @@ -258,10 +394,13 @@ template <> void gemv_fast_path(const char *trans, const int *m, const int *n, const float *alpha, const float *a, const int *lda, const float *x, const int *incx, const float *beta, float *y, const int *incy) { sgemv_(remove_const(trans), remove_const(m), remove_const(n), remove_const(alpha), remove_const(a), remove_const(lda), remove_const(x), remove_const(incx), remove_const(beta), y, remove_const(incy)); } +<<<<<<< HEAD #else INSTANTIATE(float) INSTANTIATE(double) #endif // AT_BUILD_WITH_BLAS +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 
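// fp16_gemv_notrans_fp32_arith() above converts each float16 column chunk to
// float32 and accumulates in a float32 buffer, converting back to float16
// only once per output element; the scalar fallback and the Half/BFloat16
// dot paths that follow use the same widen-then-accumulate idea through an
// opmath-style type. A scalar sketch of that reduction pattern, with generic
// storage/accumulator types standing in for at::Half and float:
#include <cstddef>

template <typename StorageT, typename AccT>
static AccT dot_widened_sketch(const StorageT* x, const StorageT* y, std::size_t n) {
  AccT sum = AccT(0);
  for (std::size_t i = 0; i < n; ++i) {
    // Widen each operand before the multiply-add so rounding error does not
    // compound in the low-precision storage type.
    sum += static_cast<AccT>(x[i]) * static_cast<AccT>(y[i]);
  }
  return sum;
}
// usage sketch: float r = dot_widened_sketch<half_type, float>(a, b, n);
// where half_type is any 16-bit float type convertible to float.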
INSTANTIATE(uint8_t) INSTANTIATE(int8_t) @@ -283,7 +422,11 @@ bool gemv_use_fast_path( beta == 0.0; } +<<<<<<< HEAD void bf16_gemv_trans( +======= +static void bf16_gemv_trans( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const int m, const int n, const at::BFloat16 alpha, @@ -368,6 +511,7 @@ void gemv_fast_path( y, *incy); } +<<<<<<< HEAD #else template <> bool scal_use_fast_path( @@ -376,6 +520,9 @@ bool scal_use_fast_path( return false; } +======= +#else // !defined(__aarch64__)) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template <> bool gemv_use_fast_path( char trans, @@ -391,6 +538,7 @@ bool gemv_use_fast_path( (c10::detail::fp16_from_bits(beta.x) == 0.0f || trans == 't' || trans == 'T'); } +<<<<<<< HEAD #ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC static void fp16_gemv_notrans_fp16_arith(int m, int n, const float16_t* a, const int lda, const float16_t *x, float16_t *y) { for (auto j = 0; j < n; j++) { @@ -464,6 +612,8 @@ void fp16_gemv_notrans( } } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template <> void gemv_fast_path( const char* trans, @@ -511,6 +661,10 @@ void gemv_fast_path( INSTANTIATE(c10::Half) INSTANTIATE(c10::BFloat16) #endif // !defined(C10_MOBILE) +<<<<<<< HEAD +======= +#endif // AT_BUILD_WITH_BLAS +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #undef INSTANTIATE } // namespace blas_impl @@ -559,7 +713,11 @@ void gemv(char trans, int64_t m, int64_t n, scalar_t alpha, const scalar_t *a, i opmath_t sum = 0; const scalar_t *row_ = a + lda * i; for (const auto j : c10::irange(m)) { +<<<<<<< HEAD sum += x[j * incx] * row_[j]; +======= + sum += static_cast(x[j * incx]) * static_cast(row_[j]); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } if (beta == scalar_t(0)) { y[i * incy] = alpha * sum; @@ -690,7 +848,11 @@ scalar_t dot_impl(int64_t n, const scalar_t* x, int64_t incx, const scalar_t* y, incx = 1; incy = 1; } +<<<<<<< HEAD return blas_impl::dot_naive(n, x, incx, y, incy, std::multiplies{}); +======= + return blas_impl::dot_naive(n, x, incx, y, incy, std::multiplies>{}); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> @@ -713,6 +875,37 @@ c10::complex dot_impl(int64_t n, const c10::complex* x, int64_t in return dot_impl_floating(n, x, incx, y, incy); } +<<<<<<< HEAD +======= +template <> +Half dot_impl(int64_t n, const Half* x, int64_t incx, const Half* y, int64_t incy) { + if (n == 1) { + incx = 1; + incy = 1; + } +#if !defined(C10_MOBILE) + if (incx == 1 && incy == 1) { + return blas_impl::fp16_dot(n, x, incx, y, incy); + } +#endif // !defined(C10_MOBILE) + return blas_impl::dot_naive(n, x, incx, y, incy, std::multiplies{}); +} + +template <> +BFloat16 dot_impl(int64_t n, const BFloat16* x, int64_t incx, const BFloat16* y, int64_t incy) { + if (n == 1) { + incx = 1; + incy = 1; + } +#if !defined(C10_MOBILE) + if (incx == 1 && incy == 1) { + return blas_impl::bf16_dot(n, x, incx, y, incy); + } +#endif // !defined(C10_MOBILE) + return blas_impl::dot_naive(n, x, incx, y, incy, std::multiplies{}); +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise 
broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace { template struct vdot_op { @@ -739,7 +932,11 @@ scalar_t vdot_impl(int64_t n, const scalar_t* x, int64_t incx, const scalar_t* y #endif } +<<<<<<< HEAD // Skip reinstantiating the explicitly specialized types `float` and `double`. +======= +// Skip reinstantiating the explicitly specialized types `float`, `double`, `half` & `bfloat16`. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define INSTANTIATE_DOT_IMPL(scalar_t) \ template scalar_t dot_impl( \ int64_t n, const scalar_t * x, int64_t incx, const scalar_t * y, int64_t incy); @@ -748,8 +945,11 @@ INSTANTIATE_DOT_IMPL(int8_t) INSTANTIATE_DOT_IMPL(int16_t) INSTANTIATE_DOT_IMPL(int) INSTANTIATE_DOT_IMPL(int64_t) +<<<<<<< HEAD INSTANTIATE_DOT_IMPL(c10::Half) INSTANTIATE_DOT_IMPL(c10::BFloat16) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define INSTANTIATE_VDOT_IMPL(scalar_t) \ template scalar_t vdot_impl( \ diff --git a/aten/src/ATen/native/CPUBlas.cpp b/aten/src/ATen/native/CPUBlas.cpp index fb401f076797..72c9ba221abb 100644 --- a/aten/src/ATen/native/CPUBlas.cpp +++ b/aten/src/ATen/native/CPUBlas.cpp @@ -15,7 +15,11 @@ #if AT_BUILD_WITH_BLAS() #if C10_IOS #include +<<<<<<< HEAD #else +======= +#elif !defined(_ARMPL_H) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) extern "C" void dgemm_(char *transa, char *transb, int *m, int *n, int *k, double *alpha, const double *a, int *lda, const double *b, int *ldb, double *beta, double *c, int *ldc); extern "C" void sgemm_(char *transa, char *transb, int *m, int *n, int *k, float *alpha, const float *a, int *lda, const float *b, int *ldb, float *beta, float *c, int *ldc); extern "C" void cgemm_(char *transa, char *transb, int *m, int *n, int *k, void *alpha, const void *a, int *lda, const void *b, int *ldb, void *beta, void *c, int *ldc); @@ -135,6 +139,10 @@ CBLAS_TRANSPOSE to_apple_accelerate_transpose(TransposeType trans) { } // namespace (anonymous) DEFINE_DISPATCH(gemm_stub); +<<<<<<< HEAD +======= +DEFINE_DISPATCH(gemm_no_downcast_stub); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void gemm( TransposeType transa, TransposeType transb, @@ -179,6 +187,21 @@ void gemm( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } +<<<<<<< HEAD +======= +#ifndef armpl_doublecomplex_t +#define COMPLEX_DBL(a) a +#define COMPLEX_DBL_CONST(a) a +#define COMPLEX_FLOAT(a) a +#define COMPLEX_FLOAT_CONST(a) a +#else +#define COMPLEX_DBL(a) ((armpl_doublecomplex_t*)a) +#define COMPLEX_DBL_CONST(a) ((const armpl_doublecomplex_t*)a) +#define COMPLEX_FLOAT(a) ((armpl_singlecomplex_t*)a) +#define COMPLEX_FLOAT_CONST(a) ((const armpl_singlecomplex_t*)a) +#endif + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void gemm( TransposeType transa, TransposeType transb, int64_t m, int64_t n, int64_t k, @@ -256,11 +279,19 @@ void gemm( zgemm_( &transa_, &transb_, &m_, &n_, &k_, +<<<<<<< HEAD &alpha_, a, &lda_, b, &ldb_, &beta_, c, &ldc_); +======= + COMPLEX_DBL_CONST(&alpha_), + COMPLEX_DBL_CONST(a), &lda_, + COMPLEX_DBL_CONST(b), &ldb_, + COMPLEX_DBL_CONST(&beta_), + COMPLEX_DBL(c), &ldc_); +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif return; } @@ -299,11 +330,19 @@ void gemm( cgemm_( &transa_, &transb_, &m_, &n_, &k_, +<<<<<<< HEAD &alpha_, a, &lda_, b, &ldb_, &beta_, c, &ldc_); +======= + COMPLEX_FLOAT_CONST(&alpha_), + COMPLEX_FLOAT_CONST(a), &lda_, + COMPLEX_FLOAT_CONST(b), &ldb_, + COMPLEX_FLOAT_CONST(&beta_), + COMPLEX_FLOAT(c), &ldc_); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif return; } @@ -322,6 +361,27 @@ void gemm( const float beta, at::BFloat16 *c, int64_t ldc) { internal::normalize_last_dims(transa, transb, m, n, k, &lda, &ldb, &ldc); +<<<<<<< HEAD +======= +#if AT_MKLDNN_ENABLED() +#ifdef __aarch64__ + // MKLDNN also supports ARM for bf16, and the bypass is only + // currently intended for x86/x86_64. + const bool use_bf16_gemv_trans = false; +#elif defined(__powerpc__) + const bool use_bf16_gemv_trans = false; +#else + const bool bf16_gemv_trans_would_be_faster = cpuinfo_initialize() && + !cpuinfo_has_x86_avx512bf16(); + const bool use_bf16_gemv_trans = bf16_gemv_trans_would_be_faster && + transa == TransposeType::Transpose && + transb == TransposeType::NoTranspose && n == 1 && alpha == 1.0; +#endif + if (!use_bf16_gemv_trans && mkldnn_bf16_gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)) { + return; + } +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if AT_BUILD_WITH_BLAS() && defined(BLAS_HAS_SBGEMM) if (use_blas_gemm(transa, transb, m, n, k, lda, ldb, ldc)) { int m_ = m, n_ = n, k_ = k, lda_ = lda, ldb_ = ldb, ldc_ = ldc; @@ -343,6 +403,7 @@ void gemm( return; } #endif +<<<<<<< HEAD #if AT_MKLDNN_ENABLED() #ifdef __aarch64__ // MKLDNN also supports ARM for bf16, and the bypass is only @@ -361,6 +422,8 @@ void gemm( return; } #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) gemm_stub( at::kCPU, at::kBFloat16, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); @@ -423,6 +486,16 @@ void gemm( return; } #endif +<<<<<<< HEAD +======= +#if AT_MKLDNN_ACL_ENABLED() +// add heuristic based on shape to dispatch to sbgemm_ vs MKLDNN + if (mkldnn_bf16f32_gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)) { + return; + } +#endif //AT_MKLDNN_ACL_ENABLED + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifdef MKL_HAS_SBGEMM if (use_blas_gemm(transa, transb, m, n, k, lda, ldb, ldc)) { int m_ = m, n_ = n, k_ = k, lda_ = lda, ldb_ = ldb, ldc_ = ldc; @@ -433,18 +506,31 @@ void gemm( // for the fallback path, first compute gemm with beta = 0, // and then add c in full precision. 
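// Concretely, in the fallback below: the product is accumulated into a float
// buffer via gemm_no_downcast_stub with beta = 0, and the original C is folded
// back in as c[offset] = beta * c[offset] + float_c[j * m + i], so no
// intermediate value is downcast before the final store.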
int64_t c_size = n * m; +<<<<<<< HEAD std::vector bfloat_c(c_size, 0.f); gemm_stub( at::kCPU, at::kBFloat16, transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, bfloat_c.data(), m); +======= + std::vector float_c(c_size, 0.f); + gemm_no_downcast_stub( + at::kCPU, at::kBFloat16, + transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto j : c10::irange(n)) { for (const auto i : c10::irange(m)) { auto offset = j * ldc + i; // beta == 0 won't propagate NaN from C if (beta == 0.f) { +<<<<<<< HEAD c[offset] = c10::convert(bfloat_c[j * m + i]); } else { c[offset] = beta * c[offset] + c10::convert(bfloat_c[j * m + i]); +======= + c[offset] = float_c[j * m + i]; + } else { + c[offset] = beta * c[offset] + float_c[j * m + i]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } } @@ -554,7 +640,11 @@ using is_blas_library_type = std::integral_constant>>; template +<<<<<<< HEAD void gemm_batched_generic( +======= +static void gemm_batched_generic( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TransposeType transa, TransposeType transb, int64_t batch_size, int64_t m, int64_t n, int64_t k, scalar_t alpha, @@ -568,7 +658,11 @@ void gemm_batched_generic( } template +<<<<<<< HEAD void gemm_batched( +======= +static void gemm_batched( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TransposeType transa, TransposeType transb, int64_t batch_size, int64_t m, int64_t n, int64_t k, scalar_t alpha, @@ -596,7 +690,11 @@ void gemm_batched( } template +<<<<<<< HEAD void gemm_batched_with_stride_generic( +======= +static void gemm_batched_with_stride_generic( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TransposeType transa, TransposeType transb, int64_t batch_size, int64_t m, int64_t n, int64_t k, scalar_t alpha, @@ -739,7 +837,11 @@ void axpy(int64_t n, c10::complex a, const c10::complex *x, int6 #if C10_IOS cblas_zaxpy(i_n, &a, x, i_incx, y, i_incy); #else +<<<<<<< HEAD zaxpy_(&i_n, &a, x, &i_incx, y, &i_incy); +======= + zaxpy_(&i_n, COMPLEX_DBL(&a), COMPLEX_DBL_CONST(x), &i_incx, COMPLEX_DBL(y), &i_incy); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif return; } @@ -764,7 +866,11 @@ void axpy(int64_t n, c10::complex a, const c10::complex *x, int64_ #if C10_IOS cblas_caxpy(i_n, &a, x, i_incx, y, i_incy); #else +<<<<<<< HEAD caxpy_(&i_n, &a, x, &i_incx, y, &i_incy); +======= + caxpy_(&i_n, COMPLEX_FLOAT(&a), COMPLEX_FLOAT_CONST(x), &i_incx, COMPLEX_FLOAT(y), &i_incy); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif return; } @@ -838,7 +944,11 @@ void copy(int64_t n, const c10::complex *x, int64_t incx, c10::complex>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif return; } @@ -862,7 +972,11 @@ void copy(int64_t n, const c10::complex *x, int64_t incx, c10::complex>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif 
return; } @@ -945,7 +1059,11 @@ struct PackKey { } }; +<<<<<<< HEAD inline dnnl::memory::data_type get_dnnl_dtype(ScalarType dtype) { +======= +static inline dnnl::memory::data_type get_dnnl_dtype(ScalarType dtype) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (dtype == ScalarType::Float) { return dnnl::memory::data_type::f32; } else if (dtype == ScalarType::BFloat16) { @@ -1347,6 +1465,33 @@ void brgemm( "I8 Brgemm is only supported on X64 when oneDNN ukernel is enabled and `amx` is supported"); } +<<<<<<< HEAD +======= +void brgemm( + int64_t M, + int64_t N, + int64_t K, + int64_t ld_a, + int64_t ld_b, + int64_t ld_c, + const bool add_C, + const signed char* A, + const signed char* B, + int32_t* C, + bool is_vnni) { +#if defined(ONEDNN_UKERNEL_ENABLED) + if (is_vnni && Brgemm::device_check(ScalarType::Char)) { + Brgemm::call( + M, N, K, ld_a, ld_b, ld_c, add_C, A, B, C); + return; + } +#endif + // raise an error if the path is not supported + TORCH_CHECK(false, + "I8 Brgemm is only supported on X64 when oneDNN ukernel is enabled and `amx` is supported"); +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void brgemm_release(bool is_vnni) { #if defined(ONEDNN_UKERNEL_ENABLED) if (is_vnni) { diff --git a/aten/src/ATen/native/CPUBlas.h b/aten/src/ATen/native/CPUBlas.h index c1045f78c430..f0f72a80df3d 100644 --- a/aten/src/ATen/native/CPUBlas.h +++ b/aten/src/ATen/native/CPUBlas.h @@ -29,6 +29,21 @@ using gemm_fn = void(*)( DECLARE_DISPATCH(gemm_fn, gemm_stub) +<<<<<<< HEAD +======= +using gemm_no_downcast_fn = void(*)( + at::ScalarType type, + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + const Scalar& alpha, + const void *a, int64_t lda, + const void *b, int64_t ldb, + const Scalar& beta, + void *c, int64_t ldc); + +DECLARE_DISPATCH(gemm_no_downcast_fn, gemm_no_downcast_stub) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template void gemm( TransposeType transa, TransposeType transb, @@ -259,6 +274,22 @@ TORCH_API void brgemm( int32_t* C, bool is_vnni = true); +<<<<<<< HEAD +======= +TORCH_API void brgemm( + int64_t M, + int64_t N, + int64_t K, + int64_t ld_a, + int64_t ld_b, + int64_t ld_c, + const bool add_C, + const signed char* A, + const signed char* B, + int32_t* C, + bool is_vnni = true); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Release brgemm hardware context TORCH_API void brgemm_release(bool is_vnni = true); diff --git a/aten/src/ATen/native/CPUFallback.cpp b/aten/src/ATen/native/CPUFallback.cpp index fd850846ba61..0dd83a5bcada 100644 --- a/aten/src/ATen/native/CPUFallback.cpp +++ b/aten/src/ATen/native/CPUFallback.cpp @@ -98,6 +98,7 @@ void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool const auto arguments_begin = stack->size() - num_arguments; std::vector tensor_args; +<<<<<<< HEAD std::vector tensor_args_indices; std::vector> tensorlist_args; @@ -105,6 +106,15 @@ void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool std::vector>> optional_tensorlist_args; std::vector optional_tensorlist_args_indices; +======= + std::vector tensor_args_indices; + + std::vector> tensorlist_args; + std::vector tensorlist_args_indices; + + std::vector>> 
optional_tensorlist_args; + std::vector optional_tensorlist_args_indices; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::optional tgt_device = std::nullopt; // save converted cpu tensor for TensorList and optional TensorList diff --git a/aten/src/ATen/native/ChanelShuffle.cpp b/aten/src/ATen/native/ChanelShuffle.cpp index be57917967fa..4b15f480b361 100644 --- a/aten/src/ATen/native/ChanelShuffle.cpp +++ b/aten/src/ATen/native/ChanelShuffle.cpp @@ -20,6 +20,20 @@ namespace at::native { Tensor channel_shuffle_cpu(const Tensor& self, int64_t groups) { +<<<<<<< HEAD +======= + TORCH_CHECK(self.dim() > 2, + "channel_shuffle expects input with > 2 dims, but got input with sizes ", + self.sizes()); + int64_t c = self.size(1); + TORCH_CHECK(groups > 0, + "Number of groups to divide channels in must be positive.", + " Value of groups:", groups); + TORCH_CHECK((c % groups) == 0, + "Number of channels must be divisible by groups. Got ", + c, " channels and ", groups, " groups."); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor output; if (self.numel() == 0) { output = self.alias(); diff --git a/aten/src/ATen/native/ComparisonUtils.cpp b/aten/src/ATen/native/ComparisonUtils.cpp index 4019cf2ff9b1..4518a36c5aae 100644 --- a/aten/src/ATen/native/ComparisonUtils.cpp +++ b/aten/src/ATen/native/ComparisonUtils.cpp @@ -13,15 +13,24 @@ class Tensor; namespace native { template +<<<<<<< HEAD void _assert_match(const O& original, const C& compared, const std::string& name) { +======= +static void _assert_match(const O& original, const C& compared, const std::string& name) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (compared) { bool equal = (original == compared.value()); if (!equal) { std::stringstream msg; +<<<<<<< HEAD msg << "Tensor " << name << " mismatch!"; if (!equal) { throw std::runtime_error(msg.str()); } +======= + msg << "Tensor " << name << " mismatch! 
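// That is: bail out early for non-MPS, undefined, or sparse inputs, and
// otherwise let suggest_memory_format() decide between ChannelsLast and
// ChannelsLast3d.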
Expected: " << compared.value() << ", Got: " << original; + throw std::runtime_error(msg.str()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } } @@ -30,7 +39,13 @@ void _assert_tensor_metadata_meta_symint(at::Tensor const& tensor, at::OptionalS _assert_match(tensor.sym_sizes(), sizes, "sizes"); _assert_match(tensor.sym_strides(), strides, "strides"); _assert_match(tensor.dtype(), dtype, "dtype"); +<<<<<<< HEAD _assert_match(tensor.device(), device, "device"); +======= + if (tensor.device().type() != DeviceType::Meta) { + _assert_match(tensor.device(), device, "device"); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _assert_match(tensor.layout(), layout, "layout"); } @@ -38,7 +53,13 @@ void _assert_tensor_metadata(at::Tensor const& tensor, at::OptionalIntArrayRef s _assert_match(tensor.sizes(), sizes, "sizes"); _assert_match(tensor.strides(), strides, "strides"); _assert_match(tensor.dtype(), dtype, "dtype"); +<<<<<<< HEAD _assert_match(tensor.device(), device, "device"); +======= + if (tensor.device().type() != DeviceType::Meta) { + _assert_match(tensor.device(), device, "device"); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _assert_match(tensor.layout(), layout, "layout"); } diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h index 55b11cdfd698..8439d479cecb 100644 --- a/aten/src/ATen/native/ConvUtils.h +++ b/aten/src/ATen/native/ConvUtils.h @@ -454,4 +454,22 @@ inline bool xpu_conv_use_channels_last(const at::Tensor& input, const at::Tensor return is_channel_last(input) || is_channel_last(weight); } +<<<<<<< HEAD +======= +inline bool mps_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { + + // check layout only for mps tensor. 
+ if (!input.is_mps() || !weight.is_mps()) { + return false; + } + if (!input.defined() || input.is_sparse()) { + // suggest channels_first + return false; + } + + auto fmt = input.suggest_memory_format(); + return fmt == at::MemoryFormat::ChannelsLast || fmt == at::MemoryFormat::ChannelsLast3d; +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::native diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index 162dfe56aa05..bf29a932f664 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -30,6 +30,13 @@ #include #endif +<<<<<<< HEAD +======= +#ifdef USE_MPS +#include +#endif + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef AT_PER_OPERATOR_HEADERS #include #include @@ -93,7 +100,11 @@ static bool conv_benchmark_empty_cache = true; // Check workload to activate fast depthwise FP16 cudnn conv kernels template +<<<<<<< HEAD bool check_cudnn_depthwise_workload(const at::Tensor& input, T stride) { +======= +static bool check_cudnn_depthwise_workload(const at::Tensor& input, T stride) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto w = at::symint::size(input, 3); // same as h auto ch = at::symint::size(input, 1); auto bs = at::symint::size(input, 0); @@ -216,7 +227,11 @@ bool check_cudnn_depthwise_workload(const at::Tensor& input, T stride) { // simplified version for cudnn 8.2 and above template +<<<<<<< HEAD bool check_cudnn_depthwise_workload_with_filter(const at::Tensor& input, T stride, const at::Tensor& weight) { +======= +static bool check_cudnn_depthwise_workload_with_filter(const at::Tensor& input, T stride, const at::Tensor& weight) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // 1D conv if(at::symint::size(input, 2) == 1 && stride == 1){ return true; @@ -442,11 +457,14 @@ struct ConvParams { } } if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous) { +<<<<<<< HEAD // bypass dilation checks for channels_last convolution if (deterministic && is_dilated()) { // cudnn doesn't support deterministic dilated convolution fully yet return false; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (is_dilated()) { return detail::getCUDAHooks().supportsDilatedConvolutionWithCuDNN() && !is_output_padding_big(); } @@ -466,6 +484,7 @@ struct ConvParams { // always use cudnn_depthwise for channels_last format return true; } +<<<<<<< HEAD if (detail::getCUDAHooks().supportsDepthwiseConvolutionWithCuDNN()) { long cudnn_version = detail::getCUDAHooks().versionCuDNN(); if (cudnn_version >= 8200) { @@ -484,10 +503,27 @@ struct ConvParams { // keep (7600 <= cudnn < 8200) code unchanged bool kernel_cond = (cudnn_version >= 7600 && use_cudnn(input, weight) && +======= + // native kernel doesn't support 64-bit non-splittable case + if (cudnn_enabled && needs_64bit_indexing_no_split(input, weight)) { + static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? 
detail::getCUDAHooks().versionCuDNN() : -1; + if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) { + TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions" + " if the V8 API is not enabled or before cuDNN version 9.3+." + " Upgrade cuDNN or enable the V8 API to use cuDNN for 64-bit depthwise convolutions."); + return false; + } else { + return true; + } + } + if (detail::getCUDAHooks().supportsDepthwiseConvolutionWithCuDNN()) { + bool kernel_cond = (use_cudnn(input, weight) && +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) input.scalar_type() == kHalf && // only for FP16 weight.scalar_type() == kHalf && is_depthwise(input, weight) && input.ndimension() == 4 && // TODO: 5-D contiguous depthwise is not supported yet, need benchmarks +<<<<<<< HEAD at::symint::size(weight, 2) == at::symint::size(weight, 3) && // only square kernels at::symint::size(input, 2) >= 7 && // min width/height 7 !is_dilated() && // no dilation supported @@ -499,6 +535,15 @@ struct ConvParams { } else { return false; } +======= + !is_dilated() && // no dilation supported + (stride[0] == stride[1] || at::symint::size(input, 2) == 1) && // square or 1d + at::symint::size(input, 1) >= 32); // min 32 channels supported) + if (kernel_cond) { + return check_cudnn_depthwise_workload_with_filter(input, stride[1], weight); + } + return false; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { return false; } @@ -639,7 +684,11 @@ REGISTER_NO_CPU_DISPATCH(miopen_convolution_transpose_backward_stub) REGISTER_NO_CPU_DISPATCH(miopen_depthwise_convolution_backward_stub) template +<<<<<<< HEAD std::ostream& operator<<(std::ostream & out, const ConvParams& params) { +======= +static std::ostream& operator<<(std::ostream & out, const ConvParams& params) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) out << "ConvParams {" << " stride = " << IntArrayRef{params.stride} << " padding = " << ArrayRef{params.padding} @@ -1202,7 +1251,11 @@ at::Tensor convolution_overrideable( // a bool indicating whether the bias is defined. This is done to save memory by // avoiding saving the full bias tensor for backward. template +<<<<<<< HEAD ConvBackend _select_conv_backend( +======= +static ConvBackend _select_conv_backend( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& input, const Tensor& weight, const std::optional& bias, @@ -1416,7 +1469,11 @@ static inline at::MemoryFormat determine_backend_memory_format( const Tensor& input, const Tensor& weight, const ConvBackend backend) { +<<<<<<< HEAD at::MemoryFormat backend_memory_format = at::MemoryFormat::Contiguous; +======= + auto backend_memory_format = at::MemoryFormat::Contiguous; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if !defined(C10_MOBILE) auto k = weight.ndimension(); // See Note [Mobile check segfaults] @@ -1452,6 +1509,19 @@ static inline at::MemoryFormat determine_backend_memory_format( backend_memory_format = (k == 5) ? 
at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; } break; +<<<<<<< HEAD +======= + case ConvBackend::Mps: + if (mps_conv_use_channels_last(input, weight)) { +#ifdef USE_MPS + if (!mps::is_macos_13_or_newer(mps::MacOSVersion::MACOS_VER_15_0_PLUS)) { + break; + } +#endif + backend_memory_format = (k == 5) ? MemoryFormat::ChannelsLast3d : MemoryFormat::ChannelsLast; + } + break; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) default: backend_memory_format = at::MemoryFormat::Contiguous; } diff --git a/aten/src/ATen/native/Cross.cpp b/aten/src/ATen/native/Cross.cpp index 7297aaed80d3..657ba2014d70 100644 --- a/aten/src/ATen/native/Cross.cpp +++ b/aten/src/ATen/native/Cross.cpp @@ -6,6 +6,10 @@ #include #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef AT_PER_OPERATOR_HEADERS @@ -77,6 +81,12 @@ Tensor & cross_out(const Tensor & input, const Tensor & other, const std::option TORCH_IMPL_FUNC(linalg_cross_out) (const Tensor & input, const Tensor & other, int64_t dim, const Tensor & out) { +<<<<<<< HEAD +======= + at::assert_no_internal_overlap(out); + at::assert_no_overlap(out, input); + at::assert_no_overlap(out, other); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) dim = maybe_wrap_dim(dim, input.dim()); auto out_size = out.sizes(); Tensor input_broadcasted = input.expand(out_size); diff --git a/aten/src/ATen/native/DispatchStub.cpp b/aten/src/ATen/native/DispatchStub.cpp index 1be4ec37dfef..396b9fe77127 100644 --- a/aten/src/ATen/native/DispatchStub.cpp +++ b/aten/src/ATen/native/DispatchStub.cpp @@ -4,6 +4,10 @@ #include #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if !defined(__s390x__) && !defined(__powerpc__) #include @@ -26,6 +30,7 @@ static inline bool cpu_has_vxe() #endif static CPUCapability compute_cpu_capability() { +<<<<<<< HEAD auto envar = std::getenv("ATEN_CPU_CAPABILITY"); if (envar) { #if defined(HAVE_VSX_CPU_DEFINITION) @@ -34,14 +39,34 @@ static CPUCapability compute_cpu_capability() { } #elif defined(HAVE_ZVECTOR_CPU_DEFINITION) if (strcmp(envar, "zvector") == 0) { +======= + const auto envar = c10::utils::get_env("ATEN_CPU_CAPABILITY"); + if (envar.has_value()) { +#if defined(HAVE_VSX_CPU_DEFINITION) + if (envar == "vsx") { + return CPUCapability::VSX; + } +#elif defined(HAVE_ZVECTOR_CPU_DEFINITION) + if (envar == "zvector") { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return CPUCapability::ZVECTOR; } #elif defined(HAVE_SVE_CPU_DEFINITION) int sve_vl = cpuinfo_get_max_arm_sve_length(); //Returns maximum SVE VL supported by your HW. #ifdef HAVE_SVE256_CPU_DEFINITION +<<<<<<< HEAD if (strcmp(envar, "sve256") == 0) { if (sve_vl == 256) { return CPUCapability::SVE256; +======= + if (envar == "sve256") { + if (sve_vl == 256) { +#ifdef HAVE_ARM_BF16_CPU_DEFINITION + if (cpuinfo_has_arm_bf16()) { + return CPUCapability::SVE256; + } +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } TORCH_WARN("SVE256 capability not available on hardware. 
Falling back to DEFAULT"); return CPUCapability::DEFAULT; @@ -49,20 +74,35 @@ static CPUCapability compute_cpu_capability() { #endif #else #ifdef HAVE_AVX512_CPU_DEFINITION +<<<<<<< HEAD if (strcmp(envar, "avx512") == 0) { +======= + if (envar == "avx512") { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return CPUCapability::AVX512; } #endif #ifdef HAVE_AVX2_CPU_DEFINITION +<<<<<<< HEAD if (strcmp(envar, "avx2") == 0) { +======= + if (envar == "avx2") { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return CPUCapability::AVX2; } #endif #endif +<<<<<<< HEAD if (strcmp(envar, "default") == 0) { return CPUCapability::DEFAULT; } TORCH_WARN("ignoring invalid value for ATEN_CPU_CAPABILITY: ", envar); +======= + if (envar == "default") { + return CPUCapability::DEFAULT; + } + TORCH_WARN("ignoring invalid value for ATEN_CPU_CAPABILITY: ", envar.value()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #if !defined(__powerpc__) && !defined(__s390x__) && !defined(HAVE_SVE_CPU_DEFINITION) @@ -102,7 +142,14 @@ static CPUCapability compute_cpu_capability() { } #ifdef HAVE_SVE256_CPU_DEFINITION if (sve_vl == 256) { // Check for SVE256 +<<<<<<< HEAD return CPUCapability::SVE256; +======= + #ifdef HAVE_ARM_BF16_CPU_DEFINITION + if (cpuinfo_has_arm_bf16()) + return CPUCapability::SVE256; + #endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif // Return the default CPU capability. @@ -147,6 +194,10 @@ DispatchResult DispatchStubImpl::try_get_call_ptr( c10::DeviceType::MPS, c10::DeviceType::MTIA, c10::DeviceType::XPU, +<<<<<<< HEAD +======= + c10::DeviceType::HPU, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::DeviceType::PrivateUse1 ); // Check if the device type is supported. @@ -203,6 +254,12 @@ DispatchResult DispatchStubImpl::try_get_call_ptr( return xpu_dispatch_ptr != nullptr ? DispatchResult(xpu_dispatch_ptr) : ErrorType::MissingDeviceKernel; #endif +<<<<<<< HEAD +======= + case DeviceType::HPU: + return hpu_dispatch_ptr != nullptr ? DispatchResult(hpu_dispatch_ptr) : ErrorType::MissingDeviceKernel; + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case DeviceType::PrivateUse1: return privateuse1_dispatch_ptr != nullptr ? 
DispatchResult(privateuse1_dispatch_ptr) : ErrorType::MissingDeviceKernel; diff --git a/aten/src/ATen/native/DispatchStub.h b/aten/src/ATen/native/DispatchStub.h index 725d0d08bae1..fad49a7df94a 100644 --- a/aten/src/ATen/native/DispatchStub.h +++ b/aten/src/ATen/native/DispatchStub.h @@ -44,6 +44,10 @@ // - MPS: Apple Silicon GPUs (Metal Performance Shaders) // - MTIA: Meta Training and Inference Devices // - XPU: Intel GPUs +<<<<<<< HEAD +======= +// - HPU: Reserved for HPU (Intel Gaudi) device types +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // - PrivateUse1: Reserved for private/custom device types // // If you want to update the list of supported devices, add a new dispatch_ptr @@ -63,7 +67,11 @@ enum class CPUCapability { VSX = 1, #elif defined(HAVE_ZVECTOR_CPU_DEFINITION) ZVECTOR = 1, +<<<<<<< HEAD #elif defined(HAVE_SVE_CPU_DEFINITION) +======= +#elif defined(HAVE_SVE256_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) SVE256 = 1, #else AVX2 = 1, @@ -196,6 +204,10 @@ struct TORCH_API DispatchStubImpl { #if defined(USE_XPU) void* xpu_dispatch_ptr; #endif +<<<<<<< HEAD +======= + void* hpu_dispatch_ptr; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void* privateuse1_dispatch_ptr; #else std::atomic cpu_dispatch_ptr{nullptr}; @@ -206,6 +218,10 @@ struct TORCH_API DispatchStubImpl { #if defined(USE_XPU) void* xpu_dispatch_ptr = nullptr; #endif +<<<<<<< HEAD +======= + void* hpu_dispatch_ptr = nullptr; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void* privateuse1_dispatch_ptr = nullptr; #endif }; @@ -259,6 +275,13 @@ struct DispatchStub { } #endif +<<<<<<< HEAD +======= + void set_hpu_dispatch_ptr(FnPtr fn_ptr) { + impl.hpu_dispatch_ptr = reinterpret_cast(fn_ptr); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void set_hip_dispatch_ptr(FnPtr fn_ptr) { impl.hip_dispatch_ptr = reinterpret_cast(fn_ptr); } @@ -338,6 +361,16 @@ struct RegisterXPUDispatch { }; template +<<<<<<< HEAD +======= +struct RegisterHPUDispatch { + RegisterHPUDispatch(DispatchStub &stub, typename DispatchStub::FnPtr value){ + stub.set_hpu_dispatch_ptr(value); + } +}; + +template +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) struct RegisterMPSDispatch { RegisterMPSDispatch(DispatchStub &stub, typename DispatchStub::FnPtr value) { stub.set_mps_dispatch_ptr(value); @@ -437,6 +470,12 @@ struct RegisterPRIVATEUSE1Dispatch { #define REGISTER_XPU_DISPATCH(name, fn) \ static RegisterXPUDispatch name ## __register(name, fn); +<<<<<<< HEAD +======= +#define REGISTER_HPU_DISPATCH(name, fn) \ + static RegisterHPUDispatch name ## __register(name, fn); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define REGISTER_HIP_DISPATCH(name, fn) \ static RegisterHIPDispatch name ## __register(name, fn); diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp index 336bf9364ac0..01c60a5ca7cf 100644 --- a/aten/src/ATen/native/EmbeddingBag.cpp +++ b/aten/src/ATen/native/EmbeddingBag.cpp @@ 
-112,11 +112,19 @@ index_select_add( const Tensor& add_indices, const Tensor& src, Tensor& output, +<<<<<<< HEAD const Tensor& /*offsets*/, bool /*include_last_offset*/, Tensor& bag_size, index_t padding_idx, _EmbeddingBagKernelCache* /* fbgemm_kernel_cache */) { +======= + [[maybe_unused]] const Tensor& offsets, + [[maybe_unused]] bool include_last_offset, + Tensor& bag_size, + index_t padding_idx, + [[maybe_unused]] _EmbeddingBagKernelCache* fbgemm_kernel_cache) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(select_indices.numel() == add_indices.numel()); auto* add_indices_data = add_indices.const_data_ptr(); auto* select_indices_data = select_indices.const_data_ptr(); @@ -499,11 +507,19 @@ index_select_scale_add( const Tensor& scale, const Tensor& src, Tensor& output, +<<<<<<< HEAD const Tensor& /*offsets*/, bool /*include_last_offset*/, Tensor& bag_size, index_t padding_idx, _EmbeddingBagKernelCache* /* fbgemm_kernel_cache */) { +======= + [[maybe_unused]] const Tensor& offsets, + [[maybe_unused]] bool include_last_offset, + Tensor& bag_size, + index_t padding_idx, + [[maybe_unused]] _EmbeddingBagKernelCache* fbgemm_kernel_cache) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_ASSERT(select_indices.numel() == add_indices.numel()); auto* add_indices_data = add_indices.const_data_ptr(); auto* select_indices_data = select_indices.const_data_ptr(); @@ -535,9 +551,15 @@ index_select_scale_add( if (idx != padding_idx) { auto* src_base = src_data + src_stride0 * idx; auto* output_base = output_data + output_stride0 * add_indices_data[i]; +<<<<<<< HEAD auto scale = scale_data[i * scale_stride]; for (const auto j : c10::irange(ddim)) { output_base[j * output_stride1] += src_base[j * src_stride1] * scale; +======= + auto element_scale = scale_data[i * scale_stride]; + for (const auto j : c10::irange(ddim)) { + output_base[j * output_stride1] += src_base[j * src_stride1] * element_scale; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } else if (bag_size_data) { // Decrement bag_size to reflect that the index is padded @@ -718,10 +740,17 @@ index_select_scale_add( if (idx != padding_idx) { auto* src_base = src_data + src_stride0 * idx; auto* output_base_fp32 = output_data_fp32 + ddim * add_indices_data[i]; +<<<<<<< HEAD auto scale = scale_data[i * scale_stride]; for (const auto j : c10::irange(ddim)) { output_base_fp32[j] += static_cast(src_base[j * src_stride1]) * static_cast(scale); +======= + auto element_scale = scale_data[i * scale_stride]; + for (const auto j : c10::irange(ddim)) { + output_base_fp32[j] += static_cast(src_base[j * src_stride1]) * + static_cast(element_scale); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } else if (bag_size_data) { // Decrement bag_size to reflect that the index is padded @@ -851,9 +880,15 @@ index_select_scale_add(const Tensor &select_indices, if (idx != padding_idx) { auto* src_base = src_data + src_stride0 * idx; auto* output_base = output_data + output_stride0 * add_indices_data[i]; +<<<<<<< HEAD auto scale = scale_data[i * scale_stride]; for (const auto j : c10::irange(ddim)) { output_base[j * output_stride1] += src_base[j * src_stride1] * scale; +======= + auto element_scale = scale_data[i * scale_stride]; + for 
(const auto j : c10::irange(ddim)) { + output_base[j * output_stride1] += src_base[j * src_stride1] * element_scale; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } else if (bag_size_data) { // Decrement bag_size to reflect that the index is padded @@ -941,7 +976,11 @@ void make_bag_size_out( void make_max_indices_out( Tensor& max_indices_out, const Tensor& weight, +<<<<<<< HEAD const Tensor& indices, +======= + [[maybe_unused]] const Tensor& indices, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& offsets, const Tensor& bag_size, const int64_t mode, @@ -1059,13 +1098,21 @@ static Tensor apply_bag_size_backward( } template +<<<<<<< HEAD void embedding_bag_cpu_max_out( +======= +static void embedding_bag_cpu_max_out( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor* max_indices, const Tensor& weight, const Tensor& indices, const Tensor& offset2bag, const Tensor& output, +<<<<<<< HEAD bool include_last_offset, +======= + [[maybe_unused]] bool include_last_offset, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor& bag_size, int64_t padding_idx) { int64_t numIndices = indices.numel(); @@ -1323,9 +1370,15 @@ void _embedding_bag_cpu_out( const at::Tensor& weight, const at::Tensor& indices_, const at::Tensor& offsets_, +<<<<<<< HEAD const bool /* scale_grad_by_freq */, const int64_t mode, const bool /* sparse */, +======= + [[maybe_unused]] const bool scale_grad_by_freq, + const int64_t mode, + [[maybe_unused]] const bool sparse, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const std::optional& per_sample_weights, const bool include_last_offset, const std::optional& padding_idx, @@ -1505,7 +1558,11 @@ static std::vector compute_counts_uniq( } template +<<<<<<< HEAD void _embedding_bag_dense_backward_cpu_sum_mean( +======= +static void _embedding_bag_dense_backward_cpu_sum_mean( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& grad, const Tensor& indices_, const Tensor& offset2bag_, @@ -1553,11 +1610,19 @@ void _embedding_bag_dense_backward_cpu_sum_mean( &counts, &grad, &index_grad_weight, &padding_idx ](index_t start, index_t end) { for (index_t i = start; i < end; i++) { +<<<<<<< HEAD index_t start = i == 0 ? 0 : next_unique_index_idx[i - 1]; index_t index = indices_data[start]; if (index != static_cast(padding_idx)) { for (index_t j = start; j < next_unique_index_idx[i]; j++) { +======= + index_t indices_start = i == 0 ? 
0 : next_unique_index_idx[i - 1]; + index_t index = indices_data[indices_start]; + + if (index != static_cast(padding_idx)) { + for (index_t j = indices_start; j < next_unique_index_idx[i]; j++) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) index_t source = offset2bag_data[j]; double scale = 1.0; if (per_sample_weights) { @@ -1641,7 +1706,11 @@ Tensor _embedding_bag_dense_backward_cpu(const Tensor &grad_, const Tensor &indi } template +<<<<<<< HEAD Tensor _embedding_bag_per_sample_weights_backward_cpu_template( +======= +static Tensor _embedding_bag_per_sample_weights_backward_cpu_template( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& grad, const Tensor& weight, // NB: embedding table, not per_sample_weights const Tensor& indices_, @@ -1747,7 +1816,11 @@ Tensor _embedding_bag_per_sample_weights_backward_cpu( } Tensor _embedding_bag_sparse_backward_symint( +<<<<<<< HEAD const Tensor &grad_, const Tensor &indices, const Tensor &offsets, +======= + const Tensor &grad_, const Tensor &indices, [[maybe_unused]] const Tensor &offsets, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor &offset2bag, const Tensor &bag_size_, SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const std::optional& per_sample_weights_opt, int64_t padding_idx) { diff --git a/aten/src/ATen/native/FusedAdagrad.cpp b/aten/src/ATen/native/FusedAdagrad.cpp index 2fa6c4c877c1..0f5a66a65233 100644 --- a/aten/src/ATen/native/FusedAdagrad.cpp +++ b/aten/src/ATen/native/FusedAdagrad.cpp @@ -11,7 +11,10 @@ #include #endif +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::native { void _fused_adagrad_kernel_cpu_( @@ -31,12 +34,17 @@ void _fused_adagrad_kernel_cpu_( const float* found_inf_ptr = found_inf.has_value() ? 
found_inf->data_ptr() : nullptr; if (found_inf_ptr && *found_inf_ptr == 1.0) { +<<<<<<< HEAD return; +======= + return; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } size_t n_tensors = params.size(); TORCH_CHECK(grads.size() == n_tensors); TORCH_CHECK(state_sums.size() == n_tensors); TORCH_CHECK(state_steps.size() == n_tensors); +<<<<<<< HEAD for (size_t i = 0; i < n_tensors; i++){ fused_adagrad_stub( kCPU, @@ -45,14 +53,59 @@ void _fused_adagrad_kernel_cpu_( state_sums[i], state_steps[i], lr, +======= + for (size_t i = 0; i < n_tensors; i++) { + fused_adagrad_stub( + kCPU, + params[i], + grads[i], + state_sums[i], + state_steps[i], + lr, + lr_decay, + weight_decay, + eps, + maximize, + grad_scale_ptr); + } +} + +void _fused_adagrad_kernel_cpu_( + at::TensorList params, + at::TensorList grads, + at::TensorList state_sums, + at::TensorList state_steps, + const at::Tensor& lr, + const double lr_decay, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf) { + _fused_adagrad_kernel_cpu_( + params, + grads, + state_sums, + state_steps, + lr.item(), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) lr_decay, weight_decay, eps, maximize, +<<<<<<< HEAD grad_scale_ptr); } +======= + grad_scale, + found_inf); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } DEFINE_DISPATCH(fused_adagrad_stub); +<<<<<<< HEAD } +======= +} // namespace at::native +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index 5adcdc4daa4e..2e7a9aa5b7e3 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -5,6 +5,11 @@ #include #include #include +<<<<<<< HEAD +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -19,6 +24,10 @@ #include #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -40,6 +49,7 @@ namespace at::native { // Parse environment variable "TORCH_LINEAR_FLATTEN_3D" static inline bool parseLinearFlatten3d() { // Uninitialized value +<<<<<<< HEAD static int value = -1; if (value == -1) { const char* env_str = std::getenv("TORCH_LINEAR_FLATTEN_3D"); @@ -50,6 +60,10 @@ static inline bool parseLinearFlatten3d() { } } return bool(value); +======= + static auto value = c10::utils::check_env("TORCH_LINEAR_FLATTEN_3D"); + return value.has_value() && value.value(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // `_flatten_nd_linear` flattens all but the last dimension of the input tensor @@ -98,9 +112,16 @@ Tensor linear(const Tensor& input, const Tensor& weight, const std::optionaldefined() && !input.is_xla()) { // Also hit the fused path for contiguous 3D input, if not using xla // backend. Reshaping/flattening has some performance implications on xla. 
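// The incoming branch below replaces the eager input.is_contiguous() call with a
// definitely_contiguous(sym_sizes, sym_strides, sym_numel) test, so the same
// fused-path decision can be made from symbolic shape metadata; when contiguity
// cannot be established symbolically, the fused flatten path is simply skipped.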
+<<<<<<< HEAD if (input.is_contiguous() && input_dim == 3) { return _flatten_nd_linear(input, weight, *bias); } else if (input.is_contiguous() && input.layout() == c10::kStrided && weight.layout() == c10::kStrided && bias->dim() == 1) { +======= + bool is_contiguous = definitely_contiguous(input.sym_sizes(), input.sym_strides(), input.sym_numel()); + if (is_contiguous && input_dim == 3) { + return _flatten_nd_linear(input, weight, *bias); + } else if (is_contiguous && input.layout() == c10::kStrided && weight.layout() == c10::kStrided && bias->dim() == 1) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _flatten_nd_linear(input, weight, *bias); } else if (parseLinearFlatten3d() && input_dim == 3) { // If user forces flattening via env var @@ -158,11 +179,19 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra Tensor left = left_; Tensor right = right_; for (const auto i : c10::irange(dim)) { +<<<<<<< HEAD auto sl = left.sym_size(i)!=1; auto sr = right.sym_size(i)!=1; if (sum_dims[i]) { // first dimensions that will be summed over after multiplication if (sl && sr) { // dimensions nontrivially in both left and right must be of the same size TORCH_CHECK(left.sym_size(i)==right.sym_size(i), "non-broadcast dimensions must match"); +======= + auto sl = TORCH_GUARD_SIZE_OBLIVIOUS(left.sym_size(i).sym_ne(1)); + auto sr = TORCH_GUARD_SIZE_OBLIVIOUS(right.sym_size(i).sym_ne(1)); + if (sum_dims[i]) { // first dimensions that will be summed over after multiplication + if (sl && sr) { // dimensions nontrivially in both left and right must be of the same size + TORCH_SYM_CHECK(left.sym_size(i).sym_eq(right.sym_size(i)), "non-broadcast dimensions must match"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sum_size *= left.sym_size(i); } else if (sl) { // if it is only in one of left and right, we can sum right away left = left.sum(i, true); @@ -171,7 +200,11 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra } } else if (sl && sr) { // now deal with dimensions that will be in the output // dimensions nontrivially in both left and right must be of the same size +<<<<<<< HEAD TORCH_CHECK(left.sym_size(i)==right.sym_size(i), "non-broadcast dimensions must match"); +======= + TORCH_SYM_CHECK(left.sym_size(i).sym_eq(right.sym_size(i)), "non-broadcast dimensions must match"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) lro.push_back(i); lro_size *= left.sym_size(i); } else if (sl) { // keep track of dimensions appearing only once @@ -481,10 +514,17 @@ Tensor einsum(std::string_view equation, TensorList operands, at::OptionalIntArr // Iterate over each dimension covered by ellipsis const auto ndim = operands[i].ndimension() - (static_cast(op_labels[i].size()) - 1); for (auto j = ell_num_dim - ndim; j < ell_num_dim; ++j) { +<<<<<<< HEAD if (op.sym_size(dim) != 1) { // Update ellipsis size TORCH_CHECK( ell_sizes[j] == 1 || ell_sizes[j] == op.sym_size(dim), +======= + if (TORCH_GUARD_SIZE_OBLIVIOUS(op.sym_size(dim).sym_ne(1))) { + // Update ellipsis size + TORCH_SYM_CHECK( + ell_sizes[j].sym_eq(1).sym_or(ell_sizes[j].sym_eq(op.sym_size(dim))), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "einsum(): dimension ", dim, " 
covered by ellipsis in operand ", @@ -500,10 +540,17 @@ Tensor einsum(std::string_view equation, TensorList operands, at::OptionalIntArr permutation[ell_index + j] = dim++; } } else if (permutation[label_perm_index[s]] == -1) { +<<<<<<< HEAD if (op.sym_size(dim) != 1) { // Update subscript TORCH_CHECK( label_size[s] == 1 || label_size[s] == op.sym_size(dim), +======= + if (TORCH_GUARD_SIZE_OBLIVIOUS(op.sym_size(dim).sym_ne(1))) { + // Update subscript + TORCH_SYM_CHECK( + label_size[s].sym_eq(1).sym_or(label_size[s].sym_eq(op.sym_size(dim))), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "einsum(): subscript ", subscript_to_label(s), " has size ", @@ -578,16 +625,28 @@ Tensor einsum(std::string_view equation, TensorList operands, at::OptionalIntArr SmallVector a_dims_to_sum; SmallVector b_dims_to_sum; for (auto dim = out_num_dim; dim < perm_index; ++dim) { +<<<<<<< HEAD if (a.sym_size(dim) != 1 && b.sym_size(dim) != 1) { +======= + if (TORCH_GUARD_SIZE_OBLIVIOUS(a.sym_size(dim).sym_ne(1)) + && TORCH_GUARD_SIZE_OBLIVIOUS(b.sym_size(dim).sym_ne(1))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (--dim_counts[dim] == 1) { sum_dims.push_back(dim); dim_counts[dim] = 0; } } else if (dim_counts[dim] == 1) { +<<<<<<< HEAD if (a.sym_size(dim) != 1) { a_dims_to_sum.push_back(dim); dim_counts[dim] = 0; } else if (b.sym_size(dim) != 1) { +======= + if (TORCH_GUARD_SIZE_OBLIVIOUS(a.sym_size(dim).sym_ne(1))) { + a_dims_to_sum.push_back(dim); + dim_counts[dim] = 0; + } else if (TORCH_GUARD_SIZE_OBLIVIOUS(b.sym_size(dim).sym_ne(1))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) b_dims_to_sum.push_back(dim); dim_counts[dim] = 0; } @@ -817,11 +876,43 @@ Tensor tensordot(const Tensor& input1, const Tensor& input2, IntArrayRef dims1, rsizes.emplace_back(t2.sym_size(i)); } } +<<<<<<< HEAD // permute and reshape for matrix multiplication t1 = t1.permute(p1).reshape_symint({size1, csize}); t2 = t2.permute(p2).reshape_symint({csize, size2}); // multiply and reshape to target size return at::mm(t1, t2).reshape_symint(rsizes); +======= + + // Full contraction (size1 == 1 and size2 == 1) is much faster when done with dot ... + // TODO(@nikitaved): there are other cases where dot outperforms gemms, + // like, for example, when the non-contracted dims are relatively small. + // NOTE(@nikitaved): contract with gemm when on MPS, + // otherwise issues with the tests xpassing/xfailing + // when enabling the fast-path with dot. 
+ // TODO: resolve that + if ((t1.device().type() == at::kMPS || t2.device().type() == at::kMPS) || size1 != 1 || size2 != 1) { + // permute and reshape for matrix multiplication + t1 = t1.permute(p1).reshape_symint({size1, csize}); + t2 = t2.permute(p2).reshape_symint({csize, size2}); + // multiply and reshape to target size + return at::mm(t1, t2).reshape_symint(rsizes); + } else { + // permute to align for contraction + t1 = t1.permute(p1); + t2 = t2.permute(p2); + + if (t1.is_contiguous() && t2.is_contiguous()) { + // If t1 and t2 are both contiguous, then flatten is a view, + // then dot is the method of choice + return at::dot(t1.flatten(), t2.flatten()).reshape_symint(rsizes); + } else { + // Otherwise mul + sum can be faster as it avoids at most 2x contiguous() calls + // NOTE: t1.dtype == t2.dtype -- check above + return (t1.squeeze() * t2.squeeze()).sum(t1.scalar_type()).reshape_symint(rsizes); + } + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Tensor &tensordot_out(const Tensor& input1, const Tensor& input2, IntArrayRef dims1, IntArrayRef dims2, Tensor& result) { @@ -831,6 +922,17 @@ Tensor &tensordot_out(const Tensor& input1, const Tensor& input2, IntArrayRef di auto output_device = result.device(); auto input1_device = input1.device(); auto input2_device = input2.device(); +<<<<<<< HEAD +======= + + if(result.defined()) { + TORCH_CHECK( + !(result.requires_grad() && at::GradMode::is_enabled() && result.sizes() != result_tmp.sizes()), + "tensordot(): the 'out' tensor was specified and requires gradients, and its shape does not match the expected result. " + "Either remove the 'out' argument, ensure it does not require gradients, or make sure its shape matches the expected output." + ); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // check if the input & output tensors are on the same device. TORCH_CHECK( (output_device == input1_device) && (input1_device == input2_device), diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 1cfff77eb592..6e5db949d5f1 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -23,6 +23,10 @@ #include #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -285,7 +289,11 @@ TORCH_META_FUNC(_linalg_slogdet)(const Tensor& A) { } template +<<<<<<< HEAD void common_checks_baddbmm_bmm(Meta& meta, const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, bool is_bmm, const std::optional& self_baddbmm = std::nullopt) { +======= +static void common_checks_baddbmm_bmm(Meta& meta, const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, bool is_bmm, const std::optional& self_baddbmm = std::nullopt) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor"); TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor"); @@ -1366,8 +1374,13 @@ static inline int64_t get_mkldnn_matmul_min_dim() { //it's enabled on all Neoverse cpus. return is_arm_neoverse() ? 8 : 0; }(); +<<<<<<< HEAD const char* ptr = std::getenv("TORCH_MKLDNN_MATMUL_MIN_DIM"); return ptr != nullptr ? 
std::atoi(ptr) : default_min_dim; +======= + const auto value = c10::utils::get_env("TORCH_MKLDNN_MATMUL_MIN_DIM"); + return value.has_value() ? std::stoi(value.value()) : default_min_dim; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }(); return value; } @@ -1380,8 +1393,13 @@ static inline int64_t get_mkldnn_matmul_min_size() { // it's enabled on all Neoverse cpus. return is_arm_neoverse() ? 8 * 1024 : 0; }(); +<<<<<<< HEAD const char* ptr = std::getenv("TORCH_MKLDNN_MATMUL_MIN_SIZE"); return ptr != nullptr ? std::atoi(ptr) : default_min_size; +======= + const auto value = c10::utils::get_env("TORCH_MKLDNN_MATMUL_MIN_SIZE"); + return value.has_value() ? std::stoi(value.value()) : default_min_size; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }(); return value; } @@ -1639,7 +1657,11 @@ TORCH_IMPL_FUNC(mm_out_cpu)(const Tensor & self, const Tensor & mat2, const Tens } template +<<<<<<< HEAD inline void baddbmm_cpu_kernel(const Tensor& result, const Tensor& self, const Tensor& mat2, const Scalar& beta_, const Scalar& alpha_) { +======= +static inline void baddbmm_cpu_kernel(const Tensor& result, const Tensor& self, const Tensor& mat2, const Scalar& beta_, const Scalar& alpha_) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t bs = result.size(0); int64_t is = result.size(1); int64_t js = result.size(2); @@ -2652,7 +2674,11 @@ Tensor mexp_impl( // `norm_cpu` is used to decide which Tensors require which approximation // based on their norm. This decision takes place on CPU. // It requires moving data back and forth between devices when `a` is on CUDA, +<<<<<<< HEAD // but at the cost of only one sigle CPU-CUDA synchronization (instead of 6), +======= + // but at the cost of only one single CPU-CUDA synchronization (instead of 6), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // and better performance overall (benchmarked). const auto norm_cpu = (a.device().type() == at::kCUDA) ? norm.to(at::kCPU) : norm; diff --git a/aten/src/ATen/native/LossCTC.cpp b/aten/src/ATen/native/LossCTC.cpp index 1513e756c71d..d6620850b996 100644 --- a/aten/src/ATen/native/LossCTC.cpp +++ b/aten/src/ATen/native/LossCTC.cpp @@ -126,6 +126,10 @@ std::tuple> ctc_loss_allocate_outpu // the alphas from the user by only returning the loss. 
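// The TORCH_MKLDNN_MATMUL_MIN_DIM / TORCH_MKLDNN_MATMUL_MIN_SIZE hunks above swap raw
// std::getenv + std::atoi for an optional-returning lookup. A minimal, self-contained
// sketch of that pattern follows; get_env below is a local stand-in, not the real
// c10::utils::get_env, whose exact signature is not shown in this diff.
#include <cstdint>
#include <cstdlib>
#include <optional>
#include <string>

// Local stand-in for an optional-returning environment lookup.
static std::optional<std::string> get_env(const char* name) {
  const char* raw = std::getenv(name);
  if (raw == nullptr) {
    return std::nullopt;
  }
  return std::string(raw);
}

// Same shape as the new code: fall back to a compile-time default when the variable
// is unset, otherwise parse the value as an integer.
static int64_t env_int_or(const char* name, int64_t default_value) {
  const auto value = get_env(name);
  return value.has_value() ? std::stoi(value.value()) : default_value;
}

// Usage: const int64_t min_dim = env_int_or("TORCH_MKLDNN_MATMUL_MIN_DIM", 8);
// Note that std::stoi throws on malformed input, whereas std::atoi silently returns 0,
// so typos in the variable now surface as errors instead of an unintended threshold.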
template std::tuple ctc_loss_cpu_template(const Tensor& log_probs, const Tensor& targets, IntArrayRef input_lengths, IntArrayRef target_lengths, int64_t BLANK) { +<<<<<<< HEAD +======= + TORCH_CHECK(log_probs.numel() > 0, "log_probs tensor must not be empty"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // log_probs: input_len x batch_size x num_labels // targets [int64]: batch_size x target_length OR sum(target_lengths) constexpr scalar_t neginf = -std::numeric_limits::infinity(); diff --git a/aten/src/ATen/native/Math.h b/aten/src/ATen/native/Math.h index 47c0a2be0303..36bc8b67c8c1 100644 --- a/aten/src/ATen/native/Math.h +++ b/aten/src/ATen/native/Math.h @@ -1680,7 +1680,11 @@ inline C10_HOST_DEVICE T calc_ndtri(T y0) { return x; } +<<<<<<< HEAD /* The next function is taken from http://ab-initio.mit.edu/Faddeev */ +======= +/* The next function is taken from http://ab-initio.mit.edu/faddeeva */ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /* Copyright (c) 2012 Massachusetts Institute of Technology * diff --git a/aten/src/ATen/native/MathBitsFallback.h b/aten/src/ATen/native/MathBitsFallback.h index de2296634e04..56cf21137c9b 100644 --- a/aten/src/ATen/native/MathBitsFallback.h +++ b/aten/src/ATen/native/MathBitsFallback.h @@ -22,7 +22,11 @@ namespace at::native { // NOTE: To use this fallback, `clone` and `copy_` should fully understand and be able to correctly handle the semantic of your math bit. struct MathOpFallback { +<<<<<<< HEAD MathOpFallback(DispatchKey key_, string op_name_) : key(key_), op_name(std::move(op_name_)) {} +======= + MathOpFallback(DispatchKey key_, std::string op_name_) : key(key_), op_name(std::move(op_name_)) {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) virtual bool is_bit_set(const Tensor&) = 0; void fallback_impl(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, torch::jit::Stack* stack) { /* @@ -151,7 +155,11 @@ struct MathOpFallback { virtual ~MathOpFallback() = default; DispatchKey key; +<<<<<<< HEAD string op_name; +======= + std::string op_name; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; } // namespace at::native diff --git a/aten/src/ATen/native/NNPACK.cpp b/aten/src/ATen/native/NNPACK.cpp index 9a5ae286666c..3ee3c9581b08 100644 --- a/aten/src/ATen/native/NNPACK.cpp +++ b/aten/src/ATen/native/NNPACK.cpp @@ -25,8 +25,12 @@ at::Tensor _nnpack_spatial_convolution( const Tensor& weight, const std::optional& bias_opt, const IntArrayRef padding, const IntArrayRef stride) { +<<<<<<< HEAD throw std::runtime_error( "nnpack_spatial_convolution: ATen not compiled with NNPACK support"); +======= + TORCH_CHECK(false, "nnpack_spatial_convolution: ATen not compiled with NNPACK support"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } bool _nnpack_available() { @@ -143,6 +147,7 @@ Tensor _nnpack_spatial_convolution( input.options()); // Our input Tensor must be in the form N,C,H,W +<<<<<<< HEAD if (input.ndimension() != 4) { throw std::runtime_error( "NNPack convolutionOutput expects 4D input Tensor N,C,H,W"); @@ -180,14 +185,58 @@ Tensor _nnpack_spatial_convolution( << ") in NNPack convolutionOutput"; throw std::runtime_error(err.str()); } 
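// The NNPACK.cpp hunks in this region replace explicit
// "if (...) { throw std::runtime_error(...); }" blocks with single TORCH_CHECK calls
// that stream all message parts. A rough, self-contained sketch of that shape, using a
// hypothetical check_or_throw helper rather than the real macro (which also records the
// source location and throws c10::Error):
#include <sstream>
#include <stdexcept>
#include <utility>

template <typename... Msg>
void check_or_throw(bool cond, Msg&&... msg) {
  if (cond) {
    return;
  }
  std::ostringstream oss;
  (oss << ... << std::forward<Msg>(msg));  // C++17 fold over the message parts
  throw std::runtime_error(oss.str());
}

// Usage mirroring the channel-count check: one call replaces the if/throw block and
// keeps the message construction in a single place.
void check_channels(int input_channels, int weight_channels) {
  check_or_throw(
      input_channels == weight_channels,
      "Mismatch between number of input channels in input Tensor (", input_channels,
      ") and weight Tensor (", weight_channels, ") in NNPack convolutionOutput");
}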
+======= + TORCH_CHECK( + input.ndimension() == 4, + "NNPack convolutionOutput expects 4D input Tensor N,C,H,W"); + + // Our weight Tensor must be in the form oC,iC,kH,kW + TORCH_CHECK( + weight.ndimension() == 4, + "NNPack convolutionOutput expects 4D weight Tensor oC,iC,kH,kW"); + + // Our output Tensor must be in the form N,oC,oH,oW + TORCH_CHECK( + output.ndimension() == 4, + "NNPack convolutionOutput expects 4D output Tensor N,oC,oH,oW"); + + // Some basic shape checking, not comprehensive + TORCH_CHECK( + input.size(1) == weight.size(1), + "Mismatch between number of input channels in input Tensor (", + input.size(1), + ") and weight Tensor (", + weight.size(1), + ") in NNPack convolutionOutput"); + + TORCH_CHECK( + weight.size(0) == output.size(1), + "Mismatch between number of output channels in weight Tensor (", + weight.size(0), + ") and output Tensor (", + output.size(1), + ") in NNPack convolutionOutput"); + + TORCH_CHECK( + input.size(0) == output.size(0), + "Mismatch between batch size in input Tensor (", + input.size(0), + ") and output Tensor (", + output.size(0), + ") in NNPack convolutionOutput"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // All Tensors must be float Tensors if (input.device().type() != kCPU || input.scalar_type() != kFloat || weight.device().type() != kCPU || weight.scalar_type() != kFloat || output.device().type() != kCPU || output.scalar_type() != kFloat || (bias.defined() && (bias.device().type() != kCPU || bias.scalar_type() != kFloat))) { +<<<<<<< HEAD throw std::runtime_error( "Mismatched Tensor types in NNPack convolutionOutput"); +======= + TORCH_CHECK(false, "Mismatched Tensor types in NNPack convolutionOutput"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } const auto algorithm = nnp_convolution_algorithm_auto; @@ -281,9 +330,15 @@ Tensor _nnpack_spatial_convolution( auto size_and_allocate_ws = [&]() { // Run a single pass to get the size of memory workspace buffer const auto status = compute(batch_size); +<<<<<<< HEAD if (status != nnp_status_success) { throw std::runtime_error("NNPACK SpatialConvolution_updateOutput failed"); } +======= + TORCH_CHECK( + status == nnp_status_success, + "NNPACK SpatialConvolution_updateOutput failed"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) workspace.allocate(); }; @@ -304,9 +359,15 @@ Tensor _nnpack_spatial_convolution( status = compute(batch_size); } +<<<<<<< HEAD if (status != nnp_status_success) { throw std::runtime_error("NNPACK SpatialConvolution_updateOutput failed"); } +======= + TORCH_CHECK( + status == nnp_status_success, + "NNPACK SpatialConvolution_updateOutput failed"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return output; } diff --git a/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp b/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp index cb9f3c469349..7973c01fe38f 100644 --- a/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp +++ b/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp @@ -20,9 +20,12 @@ namespace at::native { +<<<<<<< HEAD template void gemv(char trans, int64_t m, int64_t n, scalar_t alpha, scalar_t *a, int64_t lda, scalar_t *x, int64_t incx, scalar_t beta, scalar_t *y, int64_t incy); +======= +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace { static inline void slow_conv_transpose3d_shape_check( @@ -299,7 +302,11 @@ void slow_conv_transpose3d_out_cpu_template( int64_t elt; // For each elt in batch, do: for (elt = 0; elt < batch_size; ++elt) { +<<<<<<< HEAD // Matrix mulitply per output: +======= + // Matrix multiply per output: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) input_n = input.select(0, elt); output_n = output.select(0, elt); @@ -523,7 +530,11 @@ void slow_conv_transpose3d_backward_out_cpu_template( int64_t elt; // For each elt in batch, do: for (elt = 0; elt < batch_size; ++elt) { +<<<<<<< HEAD // Matrix mulitply per sample: +======= + // Matrix multiply per sample: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) grad_input_n = grad_input.select(0, elt); grad_output_n = grad_output.select(0, elt); @@ -739,12 +750,20 @@ void slow_conv_transpose3d_acc_grad_parameters_cpu( int64_t elt; // For each elt in batch, do: for (elt = 0; elt < batch_size; ++elt) { +<<<<<<< HEAD // Matrix mulitply per output: +======= + // Matrix multiply per output: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) grad_output_n = grad_output.select(0, elt); // Do Weight: if (grad_weight.defined()) { +<<<<<<< HEAD // Matrix mulitply per output: +======= + // Matrix multiply per output: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) input_n = input.select(0, elt); if (need_columns) { diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index ba7d9601fad5..ddd0dbb4b36f 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -61,7 +61,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static const int MIOPEN_DIM_MAX = 5; @@ -133,7 +136,11 @@ static inline MemoryFormat suggest_memory_format_contig(const Tensor& t) { } template +<<<<<<< HEAD std::tuple batch_norm_cpu_transform_input_template( +======= +static std::tuple batch_norm_cpu_transform_input_template( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& input, const Tensor& weight, const Tensor& bias, const Tensor& save_mean /* optional */, const Tensor& save_invstd /* optional */, const Tensor& running_mean /* optional */, const Tensor& running_var /* optional */, @@ -198,7 +205,11 @@ std::tuple batch_norm_cpu_transform_input_template( } template class VarTransform> +<<<<<<< HEAD std::tuple batch_norm_cpu_update_stats_template( +======= +static std::tuple batch_norm_cpu_update_stats_template( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& input, const Tensor& running_mean, const Tensor& running_var, double momentum, double eps, Tensor& save_mean, Tensor& save_var_transform) { @@ -288,7 +299,11 @@ std::tuple batch_norm_cpu_update_stats_template( } template class VarTransform> +<<<<<<< HEAD std::tuple batch_norm_cpu_update_stats_template( +======= +static std::tuple 
batch_norm_cpu_update_stats_template( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& input, const Tensor& running_mean, const Tensor& running_var, double momentum, double eps) { int64_t n_input = input.size(1); @@ -307,7 +322,11 @@ std::tuple batch_norm_cpu_update_stats_template( } template +<<<<<<< HEAD std::tuple batch_norm_backward_cpu_template( +======= +static std::tuple batch_norm_backward_cpu_template( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& grad_out_, const Tensor& input, const Tensor& weight, const Tensor& running_mean, const Tensor& running_var, const Tensor& save_mean, const Tensor& save_invstd, bool train, double eps, std::array grad_input_mask) { @@ -528,6 +547,7 @@ BatchNormBackend _select_batch_norm_backend( bool PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM").value_or(ROCM_VERSION >= 70000); if ( +<<<<<<< HEAD input.is_cuda() && (input.dim() <= MIOPEN_DIM_MAX) && (input.scalar_type() != at::kDouble) @@ -541,6 +561,22 @@ BatchNormBackend _select_batch_norm_backend( && (input.dim() >= 3) && detail::getCUDAHooks().compiledWithMIOpen() && cudnn_enabled +======= + detail::getCUDAHooks().compiledWithMIOpen() + && cudnn_enabled + && input.is_cuda() + && input.dim() <= MIOPEN_DIM_MAX + && input.dim() >= 3 + && input.scalar_type() != at::kDouble +#if (defined(USE_ROCM) && ROCM_VERSION < 60400) + && (input.scalar_type() != at::kBFloat16) +#endif + && (detail::getCUDAHooks().versionMIOpen() >= 30400 || input.scalar_type() != at::kBFloat16) + && weight.scalar_type() == at::kFloat // only FP32 weight for FP32 or FP16/BF16(mixed) input + && weight.defined() && bias.defined() + && ((running_mean.defined() && running_var.defined()) + || (!running_mean.defined() && !running_var.defined() && training)) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) && (input.suggest_memory_format() == MemoryFormat::Contiguous #if (defined(USE_ROCM) && ROCM_VERSION >= 60500) || (input.suggest_memory_format() == MemoryFormat::ChannelsLast && PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM) @@ -554,7 +590,10 @@ BatchNormBackend _select_batch_norm_backend( return BatchNormBackend::Native; } +<<<<<<< HEAD bool PYTORCH_MIOPEN_EXTRA_LOGGING = c10::utils::check_env("PYTORCH_MIOPEN_EXTRA_LOGGING").value_or(false); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // _batch_norm_impl_index(_backward) are used in the JIT be able to keep the run-time selection // of backends, while enabling it to keep the information about the used backend, so that it can @@ -565,6 +604,7 @@ std::tuple _batch_norm_impl_index( const Tensor& input, const std::optional& weight_opt /* optional */, const std::optional& bias_opt /* optional */, const std::optional& running_mean_opt /* optional */, const std::optional& running_var_opt /* optional */, bool training, double momentum, double eps, bool cudnn_enabled) { // See [Note: hacky wrapper removal for optional tensor] +<<<<<<< HEAD if (PYTORCH_MIOPEN_EXTRA_LOGGING) std :: cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index" @@ -579,6 +619,8 @@ std::tuple _batch_norm_impl_index( << " cudnn_enabled=" << cudnn_enabled << std::endl; +======= +>>>>>>> 5729657180 
([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; const Tensor& bias = bias_opt.value_or(Tensor()); @@ -638,6 +680,7 @@ std::tuple _batch_norm_impl_index( Tensor reserve = at::empty({0}, input.options().dtype(kByte)); +<<<<<<< HEAD if (PYTORCH_MIOPEN_EXTRA_LOGGING) std::cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index (use_miopen)" @@ -656,6 +699,9 @@ std::tuple _batch_norm_impl_index( if (backend == BatchNormBackend::Miopen) { if (PYTORCH_MIOPEN_EXTRA_LOGGING) std::cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index (calling miopen_batch_norm)" << std::endl; +======= + if (backend == BatchNormBackend::Miopen) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return std::tuple_cat( at::miopen_batch_norm( input.contiguous(input.suggest_memory_format()), weight.contiguous(), bias.contiguous(), @@ -678,8 +724,11 @@ std::tuple _batch_norm_impl_index_backward( const Tensor& input, const Tensor& grad_output, const std::optional& weight_opt /* optional */, const std::optional& running_mean_opt /* optional */, const std::optional& running_var_opt /* optional */, const std::optional& save_mean_opt /* optional */, const std::optional& save_var_transform_opt /* optional */, bool train, double epsilon, std::array output_mask, const Tensor &reservedSpace) { // See [Note: hacky wrapper removal for optional tensor] +<<<<<<< HEAD if (PYTORCH_MIOPEN_EXTRA_LOGGING) std :: cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index_backward" << std::endl; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; const Tensor& running_mean = running_mean_opt.value_or(Tensor()); @@ -710,16 +759,22 @@ std::tuple _batch_norm_impl_index_backward( // backward in inference mode is not supported in cudnn, fallback to native if (impl_index == 0 || (!train)) { +<<<<<<< HEAD if (PYTORCH_MIOPEN_EXTRA_LOGGING) std :: cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index_backward (calling native_batch_norm_backward)" << std::endl; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::native_batch_norm_backward(grad_output, input, weight, running_mean, running_var, save_mean, save_var_transform, train, epsilon, output_mask); } else if (impl_index == 1) { // TODO: _batch_norm_impl_index_backward is only used in JIT. 
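// The _select_batch_norm_backend hunk earlier in this file rewrites the MIOpen
// eligibility test as one long conjunction. The sketch below is a loose illustration
// with made-up field names, not the actual PyTorch logic; it only shows how the
// predicate decomposes, including the rule that running stats must be either both
// present or both absent (the latter only while training).
struct BatchNormInputs {
  bool compiled_with_miopen;
  bool cudnn_enabled;
  bool input_is_cuda;
  int input_dim;
  bool input_is_double;
  bool weight_is_float;
  bool weight_defined;
  bool bias_defined;
  bool has_running_mean;
  bool has_running_var;
  bool training;
};

enum class Backend { Miopen, Native };

Backend select_backend(const BatchNormInputs& in, int miopen_dim_max) {
  const bool stats_consistent =
      (in.has_running_mean && in.has_running_var) ||
      (!in.has_running_mean && !in.has_running_var && in.training);
  const bool eligible =
      in.compiled_with_miopen && in.cudnn_enabled && in.input_is_cuda &&
      in.input_dim >= 3 && in.input_dim <= miopen_dim_max &&
      !in.input_is_double && in.weight_is_float &&
      in.weight_defined && in.bias_defined && stats_consistent;
  // Dtype/version gating for bfloat16 and the memory-format check are omitted here.
  return eligible ? Backend::Miopen : Backend::Native;
}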
cudnn NHWC // format conversion is done inside cudnn_batch_norm_backward instead return at::cudnn_batch_norm_backward(input, grad_output, weight, running_mean, running_var, save_mean, save_var_transform, epsilon, reservedSpace); } else if (impl_index == 2) { +<<<<<<< HEAD if (PYTORCH_MIOPEN_EXTRA_LOGGING) std :: cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index_backward (calling miopen_batch_norm_backward)" << std::endl; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::miopen_batch_norm_backward(input, grad_output, weight, running_mean, running_var, save_mean, save_var_transform, epsilon); } TORCH_INTERNAL_ASSERT(false, "Unsupported impl_index in _batch_norm_impl_index_backward: ", impl_index); @@ -730,6 +785,7 @@ Tensor batch_norm( const Tensor& input, const std::optional& weight_opt, const std::optional& bias_opt, const std::optional& running_mean_opt, const std::optional& running_var_opt, bool training, double momentum, double eps, bool cudnn_enabled) { +<<<<<<< HEAD if (PYTORCH_MIOPEN_EXTRA_LOGGING) std :: cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* batch_norm" @@ -744,6 +800,8 @@ Tensor batch_norm( << " cudnn_enabled=" << cudnn_enabled << std::endl; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& weight = weight_opt.value_or(Tensor()); const Tensor& bias = bias_opt.value_or(Tensor()); const Tensor& running_mean = running_mean_opt.value_or(Tensor()); @@ -835,6 +893,14 @@ std::tuple batch_norm_update_stats_cpu( std::tuple batch_norm_cpu_out(const Tensor& self, const std::optional& weight_opt, const std::optional& bias_opt, const std::optional& running_mean_opt, const std::optional& running_var_opt, bool train, double momentum, double eps, Tensor& out, Tensor& save_mean, Tensor& save_var) { +<<<<<<< HEAD +======= + const bool has_running_mean = (running_mean_opt.has_value() && running_mean_opt->defined()); + const bool has_running_var = (running_var_opt.has_value() && running_var_opt->defined()); + TORCH_CHECK_VALUE(has_running_mean == has_running_var, + "running_mean and running_var must either both be None or neither be None"); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; diff --git a/aten/src/ATen/native/RNN.cpp b/aten/src/ATen/native/RNN.cpp index e7e8a49b452f..195065b13d58 100644 --- a/aten/src/ATen/native/RNN.cpp +++ b/aten/src/ATen/native/RNN.cpp @@ -880,7 +880,11 @@ struct FullBidirectionalLayer step_inputs = input_w.unbind(0); auto fw_result = layer_( step_inputs, input_hidden.first, params.first, true); +<<<<<<< HEAD TORCH_CHECK(fw_result.outputs.size() > 0, "Expected sequence length to be larger than 0 in RNN"); +======= + TORCH_CHECK(!fw_result.outputs.empty(), "Expected sequence length to be larger than 0 in RNN"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto fw_output = at::stack(fw_result.outputs, 0); input_w = params.second.linear_ih(input); step_inputs = input_w.unbind(0); @@ -895,7 +899,11 @@ struct FullBidirectionalLayer step_inputs = input.unbind(0); auto fw_result = 
layer_(step_inputs, input_hidden.first, params.first); +<<<<<<< HEAD TORCH_CHECK(fw_result.outputs.size() > 0, "Expected sequence length to be larger than 0 in RNN"); +======= + TORCH_CHECK(!fw_result.outputs.empty(), "Expected sequence length to be larger than 0 in RNN"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto fw_output = at::stack(fw_result.outputs, 0); auto rev_step_inputs = reverse(std::move(step_inputs)); auto rev_result = diff --git a/aten/src/ATen/native/RangeFactories.cpp b/aten/src/ATen/native/RangeFactories.cpp index 5ecc0f159331..3703d10b2486 100644 --- a/aten/src/ATen/native/RangeFactories.cpp +++ b/aten/src/ATen/native/RangeFactories.cpp @@ -157,12 +157,17 @@ Tensor& range_out(const Scalar& start, const Scalar& end, const Scalar& step, Te auto xend = end.to(); auto xstep = step.to(); +<<<<<<< HEAD TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); TORCH_CHECK(std::isfinite(static_cast(xstart)) && std::isfinite(static_cast(xend)), "unsupported range: ", xstart, " -> ", xend); TORCH_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)), "upper bound and lower bound inconsistent with step sign"); +======= + arange_check_bounds(start, end, step); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t size = static_cast(((xend - xstart) / xstep) + 1); if (result.numel() != size) { result.resize_({size}); diff --git a/aten/src/ATen/native/RangeUtils.h b/aten/src/ATen/native/RangeUtils.h index d1756db75016..94645ff462c9 100644 --- a/aten/src/ATen/native/RangeUtils.h +++ b/aten/src/ATen/native/RangeUtils.h @@ -2,6 +2,7 @@ #include #include +<<<<<<< HEAD namespace at { namespace native { @@ -19,6 +20,36 @@ int64_t compute_arange_size(const Scalar& start, const Scalar& end, const Scalar "unsupported range: ", xstart, " -> ", xend); TORCH_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)), "upper bound and larger bound inconsistent with step sign"); +======= + + +namespace at::native { + +inline void arange_check_bounds( + const c10::Scalar& start, + const c10::Scalar& end, + const c10::Scalar& step) { + // use double precision for validation to avoid precision issues + double dstart = start.to(); + double dend = end.to(); + double dstep = step.to(); + + TORCH_CHECK(dstep > 0 || dstep < 0, "step must be nonzero"); + TORCH_CHECK( + std::isfinite(dstart) && std::isfinite(dend), + "unsupported range: ", + dstart, + " -> ", + dend); + TORCH_CHECK( + ((dstep > 0) && (dend >= dstart)) || ((dstep < 0) && (dend <= dstart)), + "upper bound and lower bound inconsistent with step sign"); +} + +template +int64_t compute_arange_size(const Scalar& start, const Scalar& end, const Scalar& step) { + arange_check_bounds(start, end, step); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // we use double precision for (start - end) / step // to compute size_d for consistency across devices. 
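// The RangeUtils.h hunk above factors the bounds validation into arange_check_bounds
// and, as the comment notes, does the size computation in double precision for
// cross-device consistency. A self-contained sketch of the floating-point path under
// those assumptions (the integral path uses an accumulation type and a sign
// correction that is omitted here):
#include <cmath>
#include <cstdint>
#include <stdexcept>

int64_t arange_size(double start, double end, double step) {
  if (step == 0.0) {
    throw std::invalid_argument("step must be nonzero");
  }
  if (!std::isfinite(start) || !std::isfinite(end)) {
    throw std::invalid_argument("unsupported range");
  }
  if (!((step > 0 && end >= start) || (step < 0 && end <= start))) {
    throw std::invalid_argument("upper bound and lower bound inconsistent with step sign");
  }
  return static_cast<int64_t>(std::ceil((end - start) / step));
}

// e.g. arange_size(0.0, 1.0, 0.3) == 4, matching the four elements 0, 0.3, 0.6, 0.9
// produced by torch.arange(0, 1, 0.3).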
@@ -29,6 +60,13 @@ int64_t compute_arange_size(const Scalar& start, const Scalar& end, const Scalar // the corner-case we do want to take into account is int64_t, which has higher precision than double double size_d; if constexpr (std::is_same_v) { +<<<<<<< HEAD +======= + using accscalar_t = at::acc_type; + auto xstart = start.to(); + auto xend = end.to(); + auto xstep = step.to(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t sgn = (xstep > 0) - (xstep < 0); size_d = std::ceil((xend - xstart + xstep - sgn) / xstep); } else { @@ -42,4 +80,8 @@ int64_t compute_arange_size(const Scalar& start, const Scalar& end, const Scalar return static_cast(size_d); } +<<<<<<< HEAD }} // namespace at::native +======= +} // namespace at::native +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index e5778411870c..53b88ff19b9e 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -472,7 +472,11 @@ Tensor& logcumsumexp_out(const Tensor& self, int64_t dim, Tensor& result) { } template +<<<<<<< HEAD void impl_func_cum_ops( +======= +static void impl_func_cum_ops( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, int64_t dim, const Tensor& result, @@ -769,7 +773,11 @@ inline bool isnan_(T x) { } template +<<<<<<< HEAD void cummax_cummin_helper(const T1* self_data, T1* values_data, T2* indices_data, +======= +static void cummax_cummin_helper(const T1* self_data, T1* values_data, T2* indices_data, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int self_dim_size, int self_stride, int values_stride, int indices_stride) { Operation op; T1 out = c10::load(self_data); @@ -1182,7 +1190,11 @@ std::vector gradient(const Tensor& self, IntArrayRef dim, int64_t edge_o // ALL REDUCE ################################################################# +<<<<<<< HEAD inline bool should_use_acc_buffer(at::TensorIterator& iter) { +======= +static inline bool should_use_acc_buffer(at::TensorIterator& iter) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const auto ndim = iter.ndim(); if (!iter.device().is_cpu() || iter.noutputs() != 1) { return false; @@ -1244,7 +1256,11 @@ Tensor& sum_out(const Tensor& self, DimnameList dim, Tensor& nansum_out(const Tensor& self, at::OptionalIntArrayRef dim, bool keepdim, std::optional opt_dtype, Tensor& result) { if (self.device().is_cpu()) { +<<<<<<< HEAD TORCH_CHECK(!c10::isComplexType(self.scalar_type()), "nansum does not support complex inputs"); +======= + TORCH_CHECK(!c10::isComplexType(self.scalar_type()), "nansum on CPU does not support complex inputs"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // For integral types, use existing sum as @@ -1591,7 +1607,11 @@ Tensor norm(const Tensor& self, const Scalar& p) { return at::norm(self, p, IntArrayRef{}, false); } +<<<<<<< HEAD inline TensorIterator get_allany_iter( +======= +static inline TensorIterator get_allany_iter( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) const Tensor& self, const Tensor& result, OptionalIntArrayRef dims, @@ -1608,7 +1628,11 @@ inline TensorIterator get_allany_iter( } template +<<<<<<< HEAD inline void allany_impl( +======= +static inline void allany_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, const Tensor& result, OptionalIntArrayRef dims, @@ -1653,7 +1677,11 @@ TORCH_IMPL_FUNC(any_all_out)(const Tensor& self, const Tensor& result) { } template +<<<<<<< HEAD Tensor allany_dims_default(const Tensor &self, OptionalIntArrayRef dim, bool keepdim) { +======= +static Tensor allany_dims_default(const Tensor &self, OptionalIntArrayRef dim, bool keepdim) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Default implementation in terms of all-reduce or single dim reduce if (!dim) { Tensor out; @@ -1732,7 +1760,11 @@ TORCH_IMPL_FUNC(amax_out) (const Tensor& self, IntArrayRef dim, bool keepdim, co } template +<<<<<<< HEAD void argmax_argmin_impl( +======= +static void argmax_argmin_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, std::optional dim, bool keepdim, diff --git a/aten/src/ATen/native/Repeat.cpp b/aten/src/ATen/native/Repeat.cpp index fe1db473ea4c..8575a91534b8 100644 --- a/aten/src/ATen/native/Repeat.cpp +++ b/aten/src/ATen/native/Repeat.cpp @@ -74,7 +74,11 @@ Tensor repeat_interleave_symint( } Tensor repeats_ = repeats; +<<<<<<< HEAD if (repeats.dim() == 0 || (repeats.dim() == 1 && repeats.sym_size(0) == 1)) { +======= + if (repeats.dim() == 0 || (repeats.dim() == 1 && TORCH_GUARD_OR_FALSE(repeats.sym_size(0).sym_eq(1)))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) repeats_ = repeats.reshape({1}).expand_symint({input.sym_size(dim.value())}); } else if (repeats.dim() == 1) { TORCH_CHECK( diff --git a/aten/src/ATen/native/Resize.cpp b/aten/src/ATen/native/Resize.cpp index a80b8eb52b61..9efaad95d76a 100644 --- a/aten/src/ATen/native/Resize.cpp +++ b/aten/src/ATen/native/Resize.cpp @@ -9,6 +9,10 @@ #include #else #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -21,7 +25,11 @@ namespace at::native { // Returns true if resize is necessary template +<<<<<<< HEAD bool _resize_output_check(const Tensor& output, ArrayRef shape) { +======= +static bool _resize_output_check(const Tensor& output, ArrayRef shape) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Tests for resizing of tensors with one or more elements if (at::symint::sizes(output).equals(shape)) { return false; @@ -56,7 +64,11 @@ static void native_resize_(const Tensor& output, SymIntArrayRef shape) { } template +<<<<<<< HEAD bool _resize_output(const Tensor& output, ArrayRef shape) { +======= +static bool _resize_output(const Tensor& output, ArrayRef shape) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (_resize_output_check(output, shape)) { // avoid a redispatch for cpu and cuda. 
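// The Repeat.cpp hunk above and the Resize.h checks that follow move from
// TORCH_GUARD_SIZE_OBLIVIOUS to TORCH_GUARD_OR_FALSE / TORCH_GUARD_OR_TRUE. The
// snippet below is only a loose illustration of the intended fallback semantics,
// modelling a possibly-unknown symbolic condition as std::optional<bool>; the real
// macros operate on c10::SymBool.
#include <optional>

using MaybeBool = std::optional<bool>;  // known-true, known-false, or unknown

bool guard_or_false(MaybeBool cond) { return cond.value_or(false); }
bool guard_or_true(MaybeBool cond) { return cond.value_or(true); }

// Example: with an unbacked size, "repeats.size(0) == 1" is unknown, so the
// length-one fast path is skipped instead of installing a guard on the symbol.
bool take_length_one_fast_path(MaybeBool size0_eq_1) {
  return guard_or_false(size0_eq_1);
}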
// TODO: when resize_cuda_ is re-written to be unified with resize_, @@ -196,7 +208,11 @@ static void _maybe_resize_storage(TensorImpl* self, c10::SymInt new_size_bytes) } template +<<<<<<< HEAD TensorImpl* _resize_impl_( +======= +static TensorImpl* _resize_impl_( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorImpl* self, ArrayRef size, at::OptionalArrayRef stride, @@ -234,7 +250,11 @@ TensorImpl* resize_impl_cpu_( } template +<<<<<<< HEAD const Tensor& _resize_( +======= +static const Tensor& _resize_( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, ArrayRef size, std::optional optional_memory_format) { diff --git a/aten/src/ATen/native/Resize.h b/aten/src/ATen/native/Resize.h index 9111e4a08007..bb320a512983 100644 --- a/aten/src/ATen/native/Resize.h +++ b/aten/src/ATen/native/Resize.h @@ -101,7 +101,11 @@ inline void checkInBoundsForStorage( // It's ok to always evaluate to False for this early return for SymInts because // (1) maybe_convert_symint below only installs guard for int64_t case // (2) we check for this condition in the TORCH_MAYBE_SYM_CHECK below +<<<<<<< HEAD if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_eq(storage_size_bytes, 0))) { +======= + if (TORCH_GUARD_OR_FALSE(sym_eq(storage_size_bytes, 0))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NB: (a tensor with arbitrary 0 dims)'s storage can have any numel. return; } @@ -138,7 +142,11 @@ inline void checkSetStorage(Tensor& result, Storage storage, T storage_offset, // storageOffset TORCH_CHECK( +<<<<<<< HEAD storage_offset >= 0, "Tensor: invalid storage offset ", storage_offset); +======= + TORCH_GUARD_OR_TRUE(sym_ge(storage_offset, 0)), "Tensor: invalid storage offset ", storage_offset); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // set_storage_{device} (except set_storage_meta__symint) // will (unsafely) set the storage offset and then call resize_impl that diff --git a/aten/src/ATen/native/ScatterGatherChecks.h b/aten/src/ATen/native/ScatterGatherChecks.h index 3a826a7a1b93..1333809e51be 100644 --- a/aten/src/ATen/native/ScatterGatherChecks.h +++ b/aten/src/ATen/native/ScatterGatherChecks.h @@ -19,8 +19,13 @@ inline void scatter_gather_dtype_check( ) { if (index.numel() != 0) { TORCH_CHECK( +<<<<<<< HEAD index.scalar_type() == at::ScalarType::Long, method_name, "(): Expected dtype int64 for index" +======= + index.scalar_type() == at::ScalarType::Long || index.scalar_type() == at::ScalarType::Int, + method_name, "(): Expected dtype int32/int64 for index" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ); } diff --git a/aten/src/ATen/native/SharedReduceOps.h b/aten/src/ATen/native/SharedReduceOps.h index edaa106fc83c..d3fb1b0fa87f 100644 --- a/aten/src/ATen/native/SharedReduceOps.h +++ b/aten/src/ATen/native/SharedReduceOps.h @@ -26,7 +26,11 @@ template inline C10_DEVICE scalar_t max_propagate_nan(scalar_t a, scalar_t b) { #if defined(__HIPCC__) // TODO: remove this special case for HIP when issue is fixed: +<<<<<<< HEAD // https://github.com/ROCm-Developer-Tools/HIP/issues/2209 +======= + // https://github.com/ROCm/hip/issues/2209 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast 
kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t max = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::max(a, b)); #else scalar_t max = at::_isnan(b) ? b : std::max(a, b); @@ -37,7 +41,11 @@ template inline C10_DEVICE scalar_t min_propagate_nan(scalar_t a, scalar_t b) { #if defined(__HIPCC__) // TODO: remove this special case for HIP when issue is fixed: +<<<<<<< HEAD // https://github.com/ROCm-Developer-Tools/HIP/issues/2209 +======= + // https://github.com/ROCm/hip/issues/2209 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t min = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::min(a, b)); #else scalar_t min = at::_isnan(b) ? b : std::min(a, b); diff --git a/aten/src/ATen/native/SobolEngineOpsUtils.cpp b/aten/src/ATen/native/SobolEngineOpsUtils.cpp index 3d492221c505..5b0f6f7198ef 100644 --- a/aten/src/ATen/native/SobolEngineOpsUtils.cpp +++ b/aten/src/ATen/native/SobolEngineOpsUtils.cpp @@ -31,7 +31,11 @@ is present in the working directory). For additional details see [1]. # read in as dataframe, explicitly use zero values df = pd.DataFrame(rows).fillna(0).astype(int) +<<<<<<< HEAD # peform conversion according to Section 2.1 of [1] +======= + # perform conversion according to Section 2.1 of [1] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) df["poly"] = 2 * df["a"] + 2 ** df["s"] + 1 # ensure columns are properly ordered diff --git a/aten/src/ATen/native/SoftMax.cpp b/aten/src/ATen/native/SoftMax.cpp index 92fc59f1c1e7..017b17aa3002 100644 --- a/aten/src/ATen/native/SoftMax.cpp +++ b/aten/src/ATen/native/SoftMax.cpp @@ -559,7 +559,11 @@ Tensor masked_softmax_cpu(const Tensor& input_, const Tensor& mask_, const std:: TORCH_CHECK((input_.sizes()[0] == mask.sizes()[0]) && (input_.sizes()[2] == mask.sizes()[1]), "For mask_type == 1 mask shape should be (B, L)"); if (dim_ != input_.dim() - 1) { +<<<<<<< HEAD // We only process padding mask in the optimized way if softmax is applied along the last dimesion, +======= + // We only process padding mask in the optimized way if softmax is applied along the last dimension, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // otherwise we need to expand the mask into a generic 4D one mask = mask_.view({input_.sizes()[0], 1, 1, input_.sizes()[2]}); mask = mask.expand(input_.sizes()).contiguous(); @@ -570,7 +574,11 @@ Tensor masked_softmax_cpu(const Tensor& input_, const Tensor& mask_, const std:: TORCH_CHECK((mask.dim() == 2) && (input_.sizes()[2] == mask.sizes()[0]) && (input_.sizes()[2] == mask.sizes()[1]), "For mask_type == 0 mask shape should be (L, L)"); if (dim_ != input_.dim() - 1) { +<<<<<<< HEAD // We only process attention mask in a optimized way if softmax is applied along the last dimesion, +======= + // We only process attention mask in a optimized way if softmax is applied along the last dimension, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // otherwise we need to expand the mask into a generic 4D one mask = mask.view({1, 1, input_.sizes()[2], input_.sizes()[2]}); mask = mask.expand(input_.sizes()).contiguous(); diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index 0658ed6f27bd..256eff2610c0 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ 
b/aten/src/ATen/native/SpectralOps.cpp @@ -756,7 +756,11 @@ static DimVector default_alldims(const Tensor& self, at::OptionalIntArrayRef dim IntArrayRef dim_unwrapped = *dim_opt; dim.resize(dim_unwrapped.size()); for (const auto i : c10::irange(dim.size())) { +<<<<<<< HEAD dim[i] = maybe_wrap_dim(dim_unwrapped[i], self.dim(), /*wrap_scalars=*/false); +======= + dim[i] = maybe_wrap_dim(dim_unwrapped[i], self.dim(), /*wrap_scalar=*/false); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } else { dim.resize(self.dim()); diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index d8d19afeeb3d..aeb5af2038ab 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -147,7 +147,10 @@ namespace at::native { +<<<<<<< HEAD std::string shapes_as_str(TensorList tensors); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AdvancedIndex make_info(Tensor self, IOptTensorListRef orig); } // namespace at::native @@ -176,9 +179,16 @@ TORCH_META_FUNC(gather) auto is_index_empty = index.numel() == 0; if (!is_index_empty) { TORCH_CHECK( +<<<<<<< HEAD index.scalar_type() == at::ScalarType::Long, "gather", "(): Expected dtype int64 for index"); +======= + index.scalar_type() == ScalarType::Long || + index.scalar_type() == ScalarType::Int, + "gather", + "(): Expected dtype int32/int64 for index"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } if (is_index_empty) return; @@ -186,7 +196,11 @@ TORCH_META_FUNC(gather) } template +<<<<<<< HEAD void scatter_meta_impl( +======= +static void scatter_meta_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Meta& meta, const Tensor& self, int64_t dim, @@ -358,7 +372,11 @@ TORCH_PRECOMPUTE_META_FUNC(index_copy) } template +<<<<<<< HEAD void index_func_meta_impl( +======= +static void index_func_meta_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Meta& meta, const Tensor& self, int64_t dim, @@ -593,6 +611,7 @@ static bool all_strides_match(TensorList tensors) { return true; } +<<<<<<< HEAD inline std::string shapes_as_str(TensorList tensors) { std::ostringstream os; bool first = true; @@ -608,6 +627,8 @@ inline std::string shapes_as_str(TensorList tensors) { return os.str(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Replace indexed dimensions in src with stride 0 and the size of the result // tensor. The offset in these dimensions is computed by the kernel using the // index tensor's values and the stride of src. The new shape is not meaningful. 
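// The SpectralOps.cpp hunk above only renames the wrap_scalars/wrap_scalar argument of
// maybe_wrap_dim, but the wrapping rule itself is worth spelling out. A small sketch
// under the usual convention that negative dims count from the end; the special 0-d
// handling controlled by the real helper's extra flag is ignored here.
#include <cstdint>
#include <stdexcept>

int64_t wrap_dim(int64_t dim, int64_t ndim) {
  const int64_t min = -ndim;
  const int64_t max = ndim - 1;
  if (dim < min || dim > max) {
    throw std::out_of_range("Dimension out of range");
  }
  return dim < 0 ? dim + ndim : dim;
}

// e.g. wrap_dim(-1, 4) == 3, so an FFT over dim=-1 of a 4-d input targets dim 3.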
@@ -1009,7 +1030,12 @@ Tensor& _index_put_impl_( } if ((self.device().type() == DeviceType::CUDA || self.device().type() == DeviceType::XPU) && +<<<<<<< HEAD (accumulate || globalContext().deterministicAlgorithms())) { +======= + (accumulate || + (globalContext().deterministicAlgorithms() && value_.numel() > 1))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK( value_.device() == self.device(), "expected device ", @@ -2249,7 +2275,11 @@ template < typename T, typename ReduceStub, typename FillStub> +<<<<<<< HEAD void scatter_impl( +======= +static void scatter_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, int64_t dim, const Tensor& index, @@ -2822,7 +2852,11 @@ Tensor _gather_sparse_backward( } template +<<<<<<< HEAD int64_t count_nonzero_impl(TensorIteratorBase& iter, Range range) { +======= +static int64_t count_nonzero_impl(TensorIteratorBase& iter, Range range) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t num_nonzero = 0; auto loop = [&](char** data, const int64_t* strides, int64_t n) { diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index f37376b5fc83..b42c2854c1cb 100644 --- a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -89,6 +89,19 @@ static inline void check_for_unsupported_isin_dtype(const ScalarType type) { type); } +<<<<<<< HEAD +======= +static inline void check_for_unsupported_clamp_dtypes(ScalarType dtype) { + TORCH_CHECK_NOT_IMPLEMENTED( + !isComplexType(dtype), "clamp is not supported for complex types"); +} + +static inline void check_for_unsupported_clamp_dtypes(const Scalar& s) { + TORCH_CHECK_NOT_IMPLEMENTED( + !s.isComplex(), "clamp is not supported for complex types"); +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_META_FUNC(clamp) (const Tensor& self, const OptionalScalarRef min, const OptionalScalarRef max) { if (!min && !max) { @@ -96,9 +109,14 @@ TORCH_META_FUNC(clamp) false, "torch.clamp: At least one of 'min' or 'max' must not be None"); } // Manual type promotion, since scalars have to participate in it +<<<<<<< HEAD ScalarType result_type = self.scalar_type(); TORCH_CHECK( !isComplexType(result_type), "clamp is not supported for complex types"); +======= + auto result_type = self.scalar_type(); + check_for_unsupported_clamp_dtypes(result_type); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Floating is the highest supported if (!isFloatingType(result_type)) { at::native::ResultTypeState state = {}; @@ -122,8 +140,12 @@ TORCH_META_FUNC(clamp) self.dtype()); } // make sure scalars weren't complex +<<<<<<< HEAD TORCH_CHECK( !isComplexType(result_type), "clamp is not supported for complex types"); +======= + check_for_unsupported_clamp_dtypes(result_type); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build_unary_op(maybe_get_output(), self.to(result_type)); } @@ -132,9 +154,13 @@ TORCH_META_FUNC2(clamp, Tensor) TORCH_CHECK( min || max, "torch.clamp: At least one of 'min' or 'max' must not be None"); +<<<<<<< HEAD TORCH_CHECK( 
!isComplexType(self.scalar_type()), "clamp is not supported for complex types"); +======= + check_for_unsupported_clamp_dtypes(self.scalar_type()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define CLAMP_CONFIG() \ TensorIteratorConfig() \ .set_check_mem_overlap(true) \ @@ -157,10 +183,16 @@ TORCH_META_FUNC(clamp_max)(const Tensor& self, const Scalar& max) { // we could wrap max into tensor and send to tensor overload, // but relu is implemented via clamp_min, so for perf an uniformity reasons // do a faster but correct thing +<<<<<<< HEAD ScalarType result_type = self.scalar_type(); TORCH_CHECK( !isComplexType(result_type), "clamp is not supported for complex types"); TORCH_CHECK(!max.isComplex(), "clamp is not supported for complex types"); +======= + auto result_type = self.scalar_type(); + check_for_unsupported_clamp_dtypes(result_type); + check_for_unsupported_clamp_dtypes(max); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Floating is the highest supported if (!isFloatingType(result_type)) { auto result_type = at::native::result_type(self, max); @@ -183,10 +215,16 @@ TORCH_META_FUNC2(clamp_max, Tensor)(const Tensor& self, const Tensor& max) { } TORCH_META_FUNC(clamp_min)(const Tensor& self, const Scalar& min) { +<<<<<<< HEAD ScalarType result_type = self.scalar_type(); TORCH_CHECK( !isComplexType(result_type), "clamp is not supported for complex types"); TORCH_CHECK(!min.isComplex(), "clamp is not supported for complex types"); +======= + auto result_type = self.scalar_type(); + check_for_unsupported_clamp_dtypes(result_type); + check_for_unsupported_clamp_dtypes(min); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Floating is the highest supported if (!isFloatingType(result_type)) { auto result_type = at::native::result_type(self, min); @@ -485,13 +523,21 @@ void _assert_async_cpu(const Tensor& self) { void _assert_async_msg_cpu(const Tensor& self, std::string_view assert_msg) { TORCH_CHECK( native::is_nonzero(self), +<<<<<<< HEAD assert_msg != "" ? assert_msg : "Assertion is failed"); +======= + !assert_msg.empty() ? assert_msg : "Assertion is failed"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void _assert_scalar(const Scalar& scalar, std::string_view assert_msg) { TORCH_SYM_CHECK( scalar.toSymBool(), +<<<<<<< HEAD assert_msg != "" ? assert_msg : "Assertion is failed"); +======= + !assert_msg.empty() ? assert_msg : "Assertion is failed"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Tensor _functional_assert_scalar( @@ -569,7 +615,11 @@ static void isin_sorting( } template +<<<<<<< HEAD Device out_device(Args&... inps) { +======= +static Device out_device(Args&... 
inps) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto& i : {inps...}) { if (i.device() != at::kCPU) { return i.device(); @@ -739,7 +789,11 @@ std::tuple mode_out( } template +<<<<<<< HEAD void minmax_out_impl( +======= +static void minmax_out_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, int64_t dim, bool keepdim, diff --git a/aten/src/ATen/native/TensorConversions.cpp b/aten/src/ATen/native/TensorConversions.cpp index 3a60eddbe8fc..690367b0409e 100644 --- a/aten/src/ATen/native/TensorConversions.cpp +++ b/aten/src/ATen/native/TensorConversions.cpp @@ -806,7 +806,11 @@ Tensor sparse_compressed_to_dense( // Computes the strides for view_dtype output when the view dtype is // smaller than the original dtype +<<<<<<< HEAD inline SymDimVector compute_strides_for_view_dtype_downsize( +======= +static inline SymDimVector compute_strides_for_view_dtype_downsize( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) SymIntArrayRef old_strides, int64_t size_ratio, ScalarType old_dtype, @@ -832,7 +836,11 @@ inline SymDimVector compute_strides_for_view_dtype_downsize( // Computes the strides for view_dtype output when the view dtype is // larger than the original dtype +<<<<<<< HEAD inline SymDimVector compute_strides_for_view_dtype_upsize( +======= +static inline SymDimVector compute_strides_for_view_dtype_upsize( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) SymIntArrayRef old_strides, int64_t size_ratio, ScalarType old_dtype, @@ -1013,6 +1021,7 @@ static Tensor _batch_tile_tensor( static Tensor _mask_to_indices(const Tensor& mask) { // This function returns a vector of the indices at which given +<<<<<<< HEAD // boolean mask is True. at::nonzero can achieve the same, but // we yet have to compare the performance difference. TORCH_CHECK( @@ -1039,6 +1048,25 @@ static std::pair _not_zero_mask_to_col_row_indices( .expand_as(not_zero_mask) .masked_select(not_zero_mask); return std::pair(col_indices, row_indices); +======= + // boolean mask is True. Here at::nonzero performs test (time/mem). 
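// The TensorConversions.cpp change starting here swaps the masked_select-based helpers
// for at::nonzero. A hedged sketch of the two helpers as free functions (ATen calls
// only, with the debug asserts dropped), plus the shape convention they rely on:
#include <ATen/ATen.h>
#include <utility>

at::Tensor mask_to_indices(const at::Tensor& mask) {
  // nonzero() on a 1-d mask returns shape (nnz, 1); flatten() gives the flat indices.
  return at::nonzero(mask).flatten();
}

std::pair<at::Tensor, at::Tensor> mask_to_col_row_indices(const at::Tensor& not_zero_mask) {
  // nonzero() on a 2-d mask returns shape (nnz, 2): column 0 holds row indices and
  // column 1 holds column indices, so a single kernel yields both outputs.
  auto nz = not_zero_mask.nonzero();
  return {nz.select(1, 1), nz.select(1, 0)};
}

// For a mask [[false, true], [true, false]], nonzero() yields rows (0, 1) and (1, 0),
// so col_indices is [1, 0] and row_indices is [0, 1], matching the (col, row) order
// the caller destructures via std::tie.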
+ TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + mask.dim() == 1, "_mask_to_indices only supports 1-d masks."); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + mask.dtype() == at::kBool, "Expected mask to be of dtype bool."); + return at::native::flatten(at::nonzero(mask)); +} + +static std::pair _not_zero_mask_to_col_row_indices( + Tensor not_zero_mask) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + not_zero_mask.dim() == 2, + "_not_zero_mask_to_col_row_indices only supports 2-d masks."); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + not_zero_mask.dtype() == at::kBool, "Expected mask to be of dtype bool."); + auto nz = not_zero_mask.nonzero(); + return {nz.select(1, 1), nz.select(1, 0)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // Sparse layout conversions Start @@ -1319,8 +1347,13 @@ static Tensor dense_to_sparse_compressed( Tensor col_indices; Tensor compressed_indices; if (compressed_rows_layout) { +<<<<<<< HEAD std::tie(col_indices, row_indices) = _not_zero_mask_to_col_row_indices( not_zero_mask, at::kLong, not_zero_mask.device()); +======= + std::tie(col_indices, row_indices) = + _not_zero_mask_to_col_row_indices(not_zero_mask); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) compressed_indices = at::_convert_indices_from_coo_to_csr( row_indices, not_zero_mask.size(0), false /*out_int32*/); { @@ -1328,8 +1361,13 @@ static Tensor dense_to_sparse_compressed( values = values.flatten(0, 1).index_select(0, mask_indices); } } else { +<<<<<<< HEAD std::tie(row_indices, col_indices) = _not_zero_mask_to_col_row_indices( not_zero_mask.transpose(1, 0), at::kLong, not_zero_mask.device()); +======= + std::tie(row_indices, col_indices) = + _not_zero_mask_to_col_row_indices(not_zero_mask.transpose(1, 0)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) compressed_indices = at::_convert_indices_from_coo_to_csr( col_indices, not_zero_mask.size(-1), false /*out_int32*/); { @@ -1708,7 +1746,11 @@ static Tensor sparse_compressed_to_flipped( // Step 4: // Convert the COO indices to the CSC/BSC indices and form the output. +<<<<<<< HEAD // We need to sort COO indices along the "tranposed" dim to satisfy the +======= + // We need to sort COO indices along the "transposed" dim to satisfy the +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // invariant of sorted plain indices. // Hash coo indices by converting 2d indices to linear offsets with // more "weight" (aka stride) placed on the "transposed" dimension. @@ -1989,7 +2031,11 @@ TORCH_IMPL_FUNC(_convert_indices_from_csr_to_coo_structured_cpu) * Modified to ensure sorted BSR column indices. 
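// dense_to_sparse_compressed above hands the row (or column) indices of the nonzeros
// to _convert_indices_from_coo_to_csr. That compression is a counting pass plus a
// prefix sum; a small stand-alone sketch, assuming int64 indices and out_int32=false
// as in the calls above:
#include <cstdint>
#include <vector>

std::vector<int64_t> coo_rows_to_csr(const std::vector<int64_t>& row_indices, int64_t n_rows) {
  std::vector<int64_t> crow(n_rows + 1, 0);
  for (int64_t r : row_indices) {
    ++crow[r + 1];  // count nonzeros per row
  }
  for (int64_t r = 0; r < n_rows; ++r) {
    crow[r + 1] += crow[r];  // prefix sum: row r occupies [crow[r], crow[r + 1])
  }
  return crow;
}

// e.g. row_indices {0, 0, 2} with n_rows 3 gives crow {0, 2, 2, 3}.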
*/ template +<<<<<<< HEAD void _compressed_to_block_compressed_cpu_kernel( +======= +static void _compressed_to_block_compressed_cpu_kernel( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const index_t n_compressed, // Tensor size along compressed dimension const index_t n_plain, // Tensor size along plain dimension const index_t C, // Block size along compressed dimensions @@ -2086,7 +2132,11 @@ void _compressed_to_block_compressed_cpu_kernel( * https://github.com/scipy/scipy/blob/8a64c938ddf1ae4c02a08d2c5e38daeb8d061d38/scipy/sparse/sparsetools/csr.h */ template +<<<<<<< HEAD index_t compressed_count_blocks( +======= +static index_t compressed_count_blocks( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const index_t n_compressed, // Tensor size along compressed dimension const index_t n_plain, // Tensor size along plain dimension const index_t C, // Block size along compressed dimensions @@ -2110,7 +2160,11 @@ index_t compressed_count_blocks( } template +<<<<<<< HEAD Tensor _compressed_to_block_compressed_cpu( +======= +static Tensor _compressed_to_block_compressed_cpu( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, IntArrayRef blocksize) { static_assert( diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index b87e7142ea08..a9969ba462a8 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -1214,6 +1214,31 @@ Tensor randint_like( Tensor randint_like( const Tensor& self, +<<<<<<< HEAD +======= + const Tensor& high, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { + TORCH_CHECK( + high.numel() == 1 && high.ndimension() == 0 && high.device().is_cpu(), + "high must be a scalar tensor and on CPU"); + int64_t high_scalar = high.item(); + return at::native::randint_like( + self, + high_scalar, + dtype, + layout, + device, + pin_memory, + optional_memory_format); +} + +Tensor randint_like( + const Tensor& self, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t low, int64_t high, std::optional dtype, @@ -2072,22 +2097,40 @@ Tensor vander(const Tensor& x, std::optional N, bool increasing) { // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ tensor ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template +<<<<<<< HEAD Tensor tensor_cpu(ArrayRef values, const TensorOptions& options) { +======= +static Tensor tensor_cpu(ArrayRef values, const TensorOptions& options) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::detail::tensor_cpu(values, options); } template +<<<<<<< HEAD Tensor tensor_backend(ArrayRef values, const TensorOptions& options) { +======= +static Tensor tensor_backend(ArrayRef values, const TensorOptions& options) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::detail::tensor_backend(values, options); } template +<<<<<<< HEAD Tensor tensor_complex_cpu(ArrayRef values, const TensorOptions& options) { +======= +static Tensor tensor_complex_cpu( + ArrayRef values, + const TensorOptions& options) { +>>>>>>> 
5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::detail::tensor_complex_cpu(values, options); } template +<<<<<<< HEAD Tensor tensor_complex_backend( +======= +static Tensor tensor_complex_backend( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ArrayRef values, const TensorOptions& options) { return at::detail::tensor_complex_backend(values, options); diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index c66ff757641b..b6eea2963e38 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -24,6 +24,10 @@ #include #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -216,7 +220,11 @@ namespace at::meta { +<<<<<<< HEAD inline c10::MemoryFormat cat_compute_output_memory_format( +======= +static inline c10::MemoryFormat cat_compute_output_memory_format( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const MaterializedITensorListRef& inputs) { std::optional format = std::nullopt; for (const Tensor& t : inputs) { @@ -430,7 +438,11 @@ Tensor& set_storage_meta__symint( size, stride, storage_offset); // Matches maybe_resize_storage_cpu no-numel behavior +<<<<<<< HEAD if (TORCH_GUARD_SIZE_OBLIVIOUS(result.sym_numel().sym_ne(0))) { +======= + if (TORCH_GUARD_OR_TRUE(result.sym_numel().sym_ne(0))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // maybe_resize_storage_cpu can handle no storage exists at all but // that should never be the case here TORCH_INTERNAL_ASSERT(storage); @@ -439,12 +451,16 @@ Tensor& set_storage_meta__symint( // All meta data pointers are the same, so we don't have to "re" allocate // it. TODO: Actually this might not quite be correct if we use special // pointers to track whether or not fake cuda tensors are pinned or not +<<<<<<< HEAD const auto itemsize = result.dtype().itemsize(); c10::SymInt new_size_bytes = result.is_contiguous() ? at::detail::computeStorageNbytesContiguous( size, itemsize, std::move(storage_offset)) : at::detail::computeStorageNbytes( size, stride, itemsize, std::move(storage_offset)); +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // TODO: When there are unbacked SymInts, we unconditionally skip the // setter. This is technically wrong, but we cannot conveniently test // the real condition in many cases, because a lot of people are using @@ -453,10 +469,27 @@ Tensor& set_storage_meta__symint( // // The old behavior was to unconditionally set_nbytes, but I think not // setting it is more safe. +<<<<<<< HEAD if (new_size_bytes.has_hint() && storage.sym_nbytes().has_hint() && TORCH_GUARD_SIZE_OBLIVIOUS( new_size_bytes.sym_gt(storage.sym_nbytes()))) { storage.set_nbytes(std::move(new_size_bytes)); +======= + if (result.sym_numel().has_hint()) { + const auto itemsize = result.dtype().itemsize(); + + c10::SymInt new_size_bytes = result.is_contiguous() + ? 
at::detail::computeStorageNbytesContiguous( + size, itemsize, std::move(storage_offset)) + : at::detail::computeStorageNbytes( + size, stride, itemsize, std::move(storage_offset)); + + if (new_size_bytes.has_hint() && storage.sym_nbytes().has_hint() && + TORCH_GUARD_SIZE_OBLIVIOUS( + new_size_bytes.sym_gt(storage.sym_nbytes()))) { + storage.set_nbytes(std::move(new_size_bytes)); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } return result; @@ -572,7 +605,11 @@ Tensor sparse_broadcast_to(const Tensor& self, IntArrayRef size) { // } // Then define for each sparse dim the number of reps for each nnz index/value +<<<<<<< HEAD // due to broadcasting. Repetitions do not take into accout the current value +======= + // due to broadcasting. Repetitions do not take into account the current value +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // of nnz - this will be taken care of later { auto nnz_repeats = c10::DimVector(res_sparse_dim); nnz_repeats.back() = res_sparse_dim_broadcast_mask.back(); @@ -757,22 +794,38 @@ TORCH_IMPL_FUNC(cat_out_cpu) } Tensor& cat_out(TensorList tensors, Dimname dim, Tensor& result) { +<<<<<<< HEAD TORCH_CHECK(!tensors.empty(), "expected a non-empty list of Tensors"); +======= + TORCH_CHECK_VALUE(!tensors.empty(), "expected a non-empty list of Tensors"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::cat_out(result, tensors, dimname_to_position(tensors[0], dim)); } Tensor cat(TensorList tensors, Dimname dim) { +<<<<<<< HEAD TORCH_CHECK(!tensors.empty(), "expected a non-empty list of Tensors"); +======= + TORCH_CHECK_VALUE(!tensors.empty(), "expected a non-empty list of Tensors"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::cat(tensors, dimname_to_position(tensors[0], dim)); } // torch.concat, alias for torch.cat Tensor& concat_out(TensorList tensors, Dimname dim, Tensor& result) { +<<<<<<< HEAD return at::cat_out(result, tensors, dimname_to_position(tensors[0], dim)); } Tensor concat(TensorList tensors, Dimname dim) { return at::cat(tensors, dimname_to_position(tensors[0], dim)); +======= + return cat_out(tensors, dim, result); +} + +Tensor concat(TensorList tensors, Dimname dim) { + return at::cat(tensors, dim); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Tensor& concat_out(TensorList tensors, int64_t dim, Tensor& result) { @@ -785,11 +838,19 @@ Tensor concat(TensorList tensors, int64_t dim) { // torch.concatenate, alias for torch.cat Tensor& concatenate_out(TensorList tensors, Dimname dim, Tensor& result) { +<<<<<<< HEAD return at::cat_out(result, tensors, dimname_to_position(tensors[0], dim)); } Tensor concatenate(TensorList tensors, Dimname dim) { return at::cat(tensors, dimname_to_position(tensors[0], dim)); +======= + return cat_out(tensors, dim, result); +} + +Tensor concatenate(TensorList tensors, Dimname dim) { + return at::cat(tensors, dim); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Tensor& concatenate_out(TensorList tensors, int64_t dim, Tensor& result) { @@ -1119,7 +1180,11 @@ std::vector tensor_split_sections_symint( } template +<<<<<<< HEAD 
std::vector _tensor_split_indices( +======= +static std::vector _tensor_split_indices( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, ArrayRef indices, int64_t dim) { @@ -1417,7 +1482,11 @@ Tensor as_strided_tensorimpl( } template +<<<<<<< HEAD inline void setStridedUnchecked( +======= +static inline void setStridedUnchecked( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, ArrayRef size, ArrayRef stride, @@ -1922,7 +1991,11 @@ Tensor tile_symint(const Tensor& self, SymIntArrayRef reps) { // templated for ArrayRef and SmallVector use cases // template +<<<<<<< HEAD Tensor alias_with_sizes_and_strides( +======= +static Tensor alias_with_sizes_and_strides( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, const Vec& sizes, const Vec& strides) { @@ -1958,7 +2031,11 @@ Tensor alias_with_sizes_and_strides( // SymIntArrayRef/ArrayRef and // SmallVector/SymDimVector template
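
For context on the TensorFactories.cpp hunk above: the incoming side introduces a `randint_like` overload that takes the exclusive upper bound `high` as a 0-d CPU tensor and forwards it to the existing integer overload. Below is a minimal sketch of the equivalent call sequence; the wrapper name is illustrative and not part of the patch, and only calls visible in the hunk or in the public ATen API are used.

#include <ATen/ATen.h>

// Sketch of what the added overload reduces to: `high` must be a 0-d CPU
// tensor holding the exclusive upper bound; the hunk checks exactly that,
// unwraps it with item<int64_t>(), and dispatches to the int64_t overload.
at::Tensor randint_like_with_tensor_high_sketch(
    const at::Tensor& self,
    const at::Tensor& high) {
  TORCH_CHECK(
      high.numel() == 1 && high.ndimension() == 0 && high.device().is_cpu(),
      "high must be a scalar tensor and on CPU");
  const int64_t high_scalar = high.item<int64_t>();
  // Existing overload: fills a tensor shaped/typed like `self` with uniform
  // integers in [0, high_scalar).
  return at::randint_like(self, high_scalar);
}
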